# Evaluated pipelines not containing all models #982
Comments
Thank you for reporting this issue. I checked the demo for reproducing it. We need to fix this bug.
I just submitted PR #989 to fix the bug mentioned in point 1 above. Below is an updated demo for testing this issue:

```python
import numpy as np
from tpot import TPOTClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# fix random state
np.random.seed(123)
independent = np.random.randint(100, size=1000)
dependent = np.random.randint(2, size=1000)
X_train, X_test, Y_train, Y_test = train_test_split(
    independent, dependent, train_size=0.7, test_size=0.3
)
X_train = X_train.reshape(-1, 1)
medium_config = {
    "sklearn.linear_model.LogisticRegression": {
        "penalty": ["l1", "l2"],
        "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0],
        "dual": [False],
    },
    "sklearn.tree.DecisionTreeClassifier": {
        "max_depth": range(1, 21),
        "min_samples_split": range(2, 21),
        "min_samples_leaf": range(1, 21),
    },
    "sklearn.ensemble.RandomForestClassifier": {
        "n_estimators": np.arange(10, 201, 5),
        "max_features": np.arange(0.05, 1.01, 0.05),
        "min_samples_split": range(2, 21),
        "min_samples_leaf": range(1, 21),
        "bootstrap": [True, False],
    },
    "xgboost.XGBClassifier": {
        "objective": ['reg:squarederror'],
        "n_estimators": np.arange(10, 201, 5),
        "max_depth": range(1, 21),
        "learning_rate": [1e-3, 1e-2, 1e-1, 0.5, 1.0],
        "subsample": np.arange(0.05, 1, 0.05),
        "min_child_weight": range(1, 21),
        "nthread": [1],
    },
    # Transformers
    "sklearn.preprocessing.Binarizer": {"threshold": np.arange(0.0, 1.01, 0.05)},
    "sklearn.preprocessing.MinMaxScaler": {},
    "sklearn.preprocessing.RobustScaler": {},
    "sklearn.preprocessing.StandardScaler": {},
}
# In[2]:
testC2 = TPOTClassifier(
    generations=2,
    population_size=30,
    verbosity=2,
    config_dict=medium_config,
    n_jobs=2,
    scoring="accuracy",
    random_state=123,
    use_dask=True,
    template="Transformer-Classifier",
    warm_start=True,
)
scatter = []
pipelineNames = []
uniq_ind_count = []
old_inds = []
for i in range(20):
    testC2.fit(X_train, Y_train)
    all_inds = list(testC2.evaluated_individuals_.items())
    # new pipelines in an iteration (loop variable renamed to avoid shadowing i)
    tuples = [ind for ind in all_inds if ind not in old_inds]
    old_inds = all_inds
    tuples.sort(key=lambda x: x[1]["internal_cv_score"], reverse=True)
    shownModels = []
    uniq_ind = []
    for ind in tuples:
        if not uniq_ind.count(ind[0]):
            uniq_ind.append(ind[0])
    print('Iteration', i, '# Unique new pipelines', len(uniq_ind))
    uniq_ind_count.append(len(uniq_ind))
    for x in tuples:
        pipeline = x[0]
        name = pipeline[: pipeline.find("(")]
        if name in shownModels:
            continue
        if i == 0:
            pipelineNames.append(name)
            scatter.append([])
        shownModels.append(name)
        description = x[1]
        score = description["internal_cv_score"]
        pipelineNames_idx = pipelineNames.index(name)
        scatter[pipelineNames_idx].append(score)
# In[6]:
fig, ax = plt.subplots(1, 1)
for j in range(len(scatter)):
    ax.plot(range(0, len(scatter[j])), scatter[j], "-x", label=pipelineNames[j])
ax.legend(pipelineNames)
plt.show()
# In[4]:
print('Number of unique/new pipelines in each iteration', uniq_ind_count)
# In[5]:
print('Total number of unique pipelines in 20 iterations', len(list(testC2.evaluated_individuals_.keys())))
# In[7]:
testC = TPOTClassifier(
    generations=2,
    population_size=30,
    verbosity=2,
    config_dict=medium_config,
    n_jobs=2,
    scoring="accuracy",
    random_state=123,
    use_dask=True,
    template="Classifier",
    warm_start=True,
)
scatter = []
pipelineNames = []
uniq_ind_count = []
old_inds = []
for i in range(20):
    testC.fit(X_train, Y_train)
    all_inds = list(testC.evaluated_individuals_.items())
    # new pipelines in an iteration (loop variable renamed to avoid shadowing i)
    tuples = [ind for ind in all_inds if ind not in old_inds]
    old_inds = all_inds
    tuples.sort(key=lambda x: x[1]["internal_cv_score"], reverse=True)
    shownModels = []
    uniq_ind = []
    for ind in tuples:
        if not uniq_ind.count(ind[0]):
            uniq_ind.append(ind[0])
    print('Iteration', i, '# Unique new pipelines', len(uniq_ind))
    uniq_ind_count.append(len(uniq_ind))
    for x in tuples:
        pipeline = x[0]
        name = pipeline[: pipeline.find("(")]
        if name in shownModels:
            continue
        if i == 0:
            pipelineNames.append(name)
            scatter.append([])
        shownModels.append(name)
        description = x[1]
        score = description["internal_cv_score"]
        pipelineNames_idx = pipelineNames.index(name)
        scatter[pipelineNames_idx].append(score)
# In[8]:
fig, ax = plt.subplots(1, 1)
for j in range(len(scatter)):
    ax.plot(range(0, len(scatter[j])), scatter[j], "-x", label=pipelineNames[j])
ax.legend(pipelineNames)
plt.show()
```

As the figures above show, when `template="Classifier"`, not all models appeared among the newly evaluated pipelines after more than 3 iterations. I think, similar to my guess above, the reason is that point mutations could only happen on the transformer step or on the hyperparameters of one or two classifiers in this small population, since after a few iterations the population became homogeneous, dominated by one or two classifiers with high accuracy scores.
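To check this homogeneity claim directly, here is a small diagnostic sketch (my addition, not part of the original demo) that tallies the root operator of every evaluated pipeline, reusing the same `pipeline[: pipeline.find("(")]` trick as above:

```python
# Hedged diagnostic sketch: if the population really became homogeneous,
# one or two classifier names should dominate these counts.
from collections import Counter

root_counts = Counter(
    key[: key.find("(")]  # e.g. "XGBClassifier(input_matrix, ...)" -> "XGBClassifier"
    for key in testC2.evaluated_individuals_
)
print(root_counts.most_common())
```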
To test the demo above, you can install TPOT with the patch into your environment via `pip install --upgrade --no-deps --force-reinstall git+https://github.com/EpistasisLab/tpot.git@development`. Note: scikit-learn may need to be updated to 0.22 to use the dev branch.
Thank you for the fix. I am still unsure, though: what does "mutation could happen on the transformer step only in this small-size population" mean? And are you planning to fix that problem anytime soon?
I meant that point mutations could only happen on the transformer step or on the hyperparameters of one or two classifiers in this small population, because in later iterations the population is full of solutions built on only one or two classifiers. For example, if only XGBClassifier pipelines become dominant with high fitness scores in an iteration, then a point mutation will most likely just tune one hyperparameter of XGBClassifier or of the Transformer (if a hyperparameter is available); switching to a new Classifier via point mutation can also happen, but the chance is lower. And unless the solutions with the new Classifier reach a better or similar fitness score, they cannot survive to the next generation or iteration after the selection step in GP.
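A toy illustration of this selection dynamic (my own sketch, not TPOT code, with made-up fitness values): under tournament selection, the classifier family with even a slightly higher fitness quickly takes over a small population, so subsequent point mutations mostly tune that one classifier.

```python
import random

random.seed(123)
# Made-up per-family fitness scores (assumption, purely illustrative);
# XGB is only slightly better than the others.
fitness = {"XGB": 0.60, "RF": 0.58, "LR": 0.55, "DT": 0.54}
population = list(fitness) * 8  # 32 individuals, evenly mixed at the start

for gen in range(8):
    # Size-2 tournament selection: the fitter of two random picks survives.
    population = [
        max(random.sample(population, 2), key=fitness.get)
        for _ in range(len(population))
    ]
    print("generation", gen, {name: population.count(name) for name in fitness})
# After a few generations the population is almost entirely "XGB", so point
# mutations then mostly tune XGB hyperparameters (or the transformer step).
```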
Understood, thank you.
## Context of the issue
So I am trying to "stalk" the scores of pipelines during training with warm_start and small runs.
I have managed to do so for both the "Classifier" and "Regressor" templates, but pipelines start to disappear from the evaluated pipelines after I add "Transformer-" to the template.
## Process to reproduce the issue
Run the following code and compare the outputs of `testC` and `testC2`.

```python
# Imports added so the snippet is self-contained; tpotC is assumed to be an
# alias for TPOTClassifier, matching how it is used below.
import numpy as np
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier as tpotC
independent = np.random.randint(100, size=1000)
dependent = np.random.randint(2, size=1000)
X_train, X_test, Y_train, Y_test = train_test_split(
    independent, dependent, train_size=0.7, test_size=0.3
)
X_train = X_train.reshape(-1, 1)
medium_config = {
    "sklearn.linear_model.LogisticRegression": {
        "penalty": ["l1", "l2"],
        "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0],
        "dual": [False],
    },
    "sklearn.tree.DecisionTreeClassifier": {
        "max_depth": range(1, 21),
        "min_samples_split": range(2, 21),
        "min_samples_leaf": range(1, 21),
    },
    "sklearn.ensemble.RandomForestClassifier": {
        "n_estimators": np.arange(10, 201, 5),
        "max_features": np.arange(0.05, 1.01, 0.05),
        "min_samples_split": range(2, 21),
        "min_samples_leaf": range(1, 21),
        "bootstrap": [True, False],
    },
    "xgboost.XGBClassifier": {
        "objective": ['reg:squarederror'],
        "n_estimators": np.arange(10, 201, 5),
        "max_depth": range(1, 21),
        "learning_rate": [1e-3, 1e-2, 1e-1, 0.5, 1.0],
        "subsample": np.arange(0.05, 1, 0.05),
        "min_child_weight": range(1, 21),
        "nthread": [1],
    },
    # Transformers
    "sklearn.preprocessing.Binarizer": {"threshold": np.arange(0.0, 1.01, 0.05)},
    "sklearn.preprocessing.MinMaxScaler": {},
    "sklearn.preprocessing.RobustScaler": {},
    "sklearn.preprocessing.StandardScaler": {},
}
testC = tpotC(
    generations=2,
    population_size=30,
    verbosity=3,
    config_dict=medium_config,
    n_jobs=2,
    scoring="accuracy",
    random_state=123,
    use_dask=True,
    template="Classifier",
    warm_start=True,
)
testC2 = tpotC(
    generations=2,
    population_size=30,
    verbosity=3,
    config_dict=medium_config,
    n_jobs=2,
    scoring="accuracy",
    random_state=123,
    use_dask=True,
    template="Transformer-Classifier",
    warm_start=True,
)
scatter = []
pipelineNames = []
# Change testC to testC2 to see the problem
for i in range(20):
    testC.fit(X_train, Y_train)
```
## Expected result
I expect template `Transformer-Classifier` to behave like template `Classifier`, where every time I inspect the evaluated pipelines, all models are in there. The same should hold for Regressors.
## Current result
With `Transformer` in the template, certain models stop being included in `evaluated_individuals_`, and which models are left behind is not constant over different runs.
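For what it's worth, a quick way to see which configured classifiers never show up among the evaluated pipelines (a sketch of my own; the short class names below just mirror the config in the demo above):

```python
# Hedged sketch: compare the configured classifiers against the root operators
# actually observed in evaluated_individuals_ after the runs above.
configured = {
    "LogisticRegression",
    "DecisionTreeClassifier",
    "RandomForestClassifier",
    "XGBClassifier",
}
seen = {key[: key.find("(")] for key in testC2.evaluated_individuals_}
print("missing classifiers:", configured - seen)
```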
## Possible fix
No idea actually, help!
## Screenshot
Classifier
![image](https://user-images.githubusercontent.com/42673559/71504297-df5ea400-28b3-11ea-834f-fbb836f0fe48.png)
Transformer-Classifier
![image](https://user-images.githubusercontent.com/42673559/71504306-e685b200-28b3-11ea-85df-541a4edb8885.png)
Thank you.