Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deep learning #1025

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ci/.travis_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ conda create -n testenv --yes python=$PYTHON_VERSION pip nose \
source activate testenv

pip install deap tqdm update_checker stopit \
dask[delayed] dask[dataframe] xgboost cloudpickle==0.5.6 dask_ml==$DASK_ML_VERSION fsspec>=0.3.3
dask[delayed] dask[dataframe] xgboost cloudpickle==0.5.6 dask_ml==$DASK_ML_VERSION fsspec>=0.3.3 tensorflow>=2.1.0

if [[ "$COVERAGE" == "true" ]]; then
pip install coverage coveralls
Expand Down
1 change: 1 addition & 0 deletions optional-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
xgboost==0.90
scikit-mdr==0.4.4
skrebate==0.3.4
tensorflow>=2.1.0
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def calculate_version():
'dask': ['dask>=0.18.2',
'distributed>=1.22.1',
'dask-ml>=1.0.0'],
'tensorflow': ['tensorflow>=2.1.0'],
},
classifiers=[
'Intended Audience :: Science/Research',
Expand Down
38 changes: 38 additions & 0 deletions tests/test_deep_learning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import nose
from sklearn.datasets import make_classification, make_regression
from sklearn.neural_network import MLPClassifier, MLPRegressor
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this line, since these two imports do not seem to be used in the script.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please also fix the failing unit tests in this test script.


from tpot.builtins import DeepLearningTransformer

from tpot.builtins.deep_learning import HAS_TENSORFLOW


if not HAS_TENSORFLOW:
raise nose.SkipTest()


def test_Embedding_Keras():
"""Assert that Embedding for classification works as expecated."""
layer_sizes = [20, 100, 50, 20, 60, 100]
X, y = make_classification(random_state=1)

def check(X, X_transformed, embedding_layer_size):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is duplicated; please keep only one copy.

assert X.shape[1] + embedding_layer_size == X_transformed.shape[1]

for i in range(len(layer_sizes) - 1):
cs = DeepLearningTransformer(embedding_layer=i)
X_transformed = cs.fit_transform(X=X, y=y)
yield check, X, X_transformed, layer_sizes[i]



layer_sizes = [20, 100, 50, 20, 60, 100]
X, y = make_classification(random_state=1)

def check(X, X_transformed, embedding_layer_size):
assert X.shape[1] + embedding_layer_size == X_transformed.shape[1]

for i in range(len(layer_sizes) - 1):
cs = DeepLearningTransformer(hidden_layer_sizes=layer_sizes, embedding_layer=i)
X_transformed = cs.fit_transform(X=X, y=y)
check(X, X_transformed, layer_sizes[i])
1 change: 1 addition & 0 deletions tpot/builtins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@
from .one_hot_encoder import OneHotEncoder, auto_select_categorical_features, _transform_selected
from .feature_transformers import CategoricalSelector, ContinuousSelector
from .feature_set_selector import FeatureSetSelector
from .deep_learning import DeepLearningTransformer, DeepLearningClassifier, DeepLearningRegressor
223 changes: 223 additions & 0 deletions tpot/builtins/deep_learning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""This file is part of the TPOT library.

TPOT was primarily developed at the University of Pennsylvania by:
- Randal S. Olson (rso@randalolson.com)
- Weixuan Fu (weixuanf@upenn.edu)
- Daniel Angell (dpa34@drexel.edu)
- and many more generous open source contributors

TPOT is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.

TPOT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.

"""

from functools import partial

import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array

try:
import os
os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from tensorflow.keras.models import Model as KerasModel
from tensorflow.keras import backend as keras_backend
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
HAS_TENSORFLOW = True
except ImportError:
HAS_TENSORFLOW = False


DEFAULT_HIDDEN_LAYERS = (100,)  # same as sklearn.neural_net


def _resolve_hidden_layer_sizes(hidden_layer_sizes, input_size):
    """Validate ``hidden_layer_sizes`` and return absolute widths as ints.

    Parameters
    ----------
    hidden_layer_sizes: iterable
        Either integers greater than 1 (absolute layer widths) or floats
        strictly between 0 and 1 (widths relative to ``input_size``).
    input_size: int
        Number of input features; only used to scale relative widths.

    Returns
    -------
    list of int
        One width per hidden layer.

    Raises
    ------
    ValueError
        If the iterable is empty or mixes / violates the two accepted forms.
    """
    # Materialise once so generators are not exhausted by the checks below.
    sizes = list(hidden_layer_sizes)
    if not sizes:
        # all() over an empty iterable is True, so without this guard an
        # empty specification would pass validation and only fail later
        # when the first layer is indexed.
        raise ValueError("`hidden_layer_sizes` must contain at least one layer")
    if all(0 < x < 1 for x in sizes):
        # Relative layer sizes: ceil ensures that no layer ends up being < 1.
        return [int(np.ceil(x * input_size)) for x in sizes]
    if all(isinstance(x, (int, np.integer)) and x > 1 for x in sizes):
        return [int(x) for x in sizes]
    raise ValueError(
        "`hidden_layer_sizes` must be an iterable of int x>1 or floats 0<x<1"
    )


def _build_model(
    input_size,
    output_size,  # num classes for classification or 1 for regression
    hidden_layer_sizes=DEFAULT_HIDDEN_LAYERS,
    optimizer='adam',
    loss='categorical_crossentropy',
    kernel_initializer='normal',
    kernel_regularizer='l2',
    hidden_layer_activation='relu',
    output_layer_activation='softmax',
    metrics=['accuracy'],
):
    """Build and compile a fully connected Keras ``Sequential`` network.

    Parameters
    ----------
    input_size: int
        Number of input features (``input_dim`` of the first layer).
    output_size: int
        Number of units in the output layer.
    hidden_layer_sizes: iterable
        Integers greater than 1, or floats in (0, 1) interpreted as a
        fraction of ``input_size``. Every entry becomes one ``Dense`` layer.
    optimizer, loss, metrics:
        Passed unchanged to ``model.compile``.
    kernel_initializer, kernel_regularizer:
        Applied to every ``Dense`` layer except the first one.
    hidden_layer_activation, output_layer_activation:
        Activations of the hidden layers and of the output layer.

    Returns
    -------
    The compiled ``Sequential`` model.

    Raises
    ------
    ValueError
        If ``hidden_layer_sizes`` is empty or malformed.
    """
    layer_sizes = _resolve_hidden_layer_sizes(hidden_layer_sizes, input_size)
    if isinstance(metrics, list):
        # Hand Keras a private copy so the shared default list in the
        # signature can never be mutated across calls.
        metrics = list(metrics)

    model = Sequential()
    # First hidden layer; it also declares the input dimension.
    # NOTE(review): unlike the layers below, this one gets no kernel
    # regularizer/initializer -- confirm that this is intentional.
    model.add(Dense(
        layer_sizes[0],
        activation=hidden_layer_activation,
        input_dim=input_size,
    ))
    # Remaining hidden layers.
    for layer_size in layer_sizes[1:]:
        model.add(Dense(
            layer_size,
            activation=hidden_layer_activation,
            kernel_regularizer=kernel_regularizer,
            kernel_initializer=kernel_initializer,
        ))
    # Output layer.
    model.add(Dense(
        output_size,
        activation=output_layer_activation,
        kernel_regularizer=kernel_regularizer,
        kernel_initializer=kernel_initializer,
    ))
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    return model

class DeepLearningClassifier(KerasClassifier):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on the error in AppVeyor, importing tpot failed because tensorflow is not installed in the environment. Maybe use HAS_TENSORFLOW to determine whether these new classes should be created.

# Classification defaults pre-bound onto the shared module-level builder.
_build_model = partial(
    _build_model,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
    output_layer_activation='softmax',
)

def __init__(self, **sk_params):
    """Install a placeholder model builder, then pass ``sk_params`` to the base class."""
    # Create a prototype with empty input/output sizes;
    # this is needed for sk_params to be processed correctly.
    prototype = partial(self._build_model, input_size=None, output_size=None)
    self.__call__ = prototype
    super().__init__(**sk_params)

def fit(self, X, y=None, **fit_params):
    """Size the network from the data, train it, and return ``self``.

    Parameters
    ----------
    X: array-like, {n_samples, n_features}
        Training data; the second dimension becomes the network input size.
    y: array-like
        Class labels (1-D) or one-hot / indicator targets (2-D with more
        than one column).
    **fit_params:
        Forwarded to the parent class ``fit``.

    Returns
    -------
    self, with the value returned by the parent ``fit`` stored in
    ``self.history``.
    """
    targets = np.asarray(y)
    if targets.ndim == 2 and targets.shape[1] > 1:
        # One column per class: np.unique would only count the distinct
        # cell values (typically just 0 and 1), not the number of classes.
        n_outputs = targets.shape[1]
    else:
        n_outputs = np.unique(targets).size
    # Rebind the builder now that the real input/output sizes are known.
    # np.shape also accepts nested lists, not only ndarrays.
    self.__call__ = partial(
        self._build_model,
        input_size=np.shape(X)[1],
        output_size=n_outputs,
    )
    self.history = super().fit(X, y, **fit_params)
    return self

class DeepLearningRegressor(KerasRegressor):
    """Keras regressor whose network comes from the module-level ``_build_model``."""

    # Regression defaults pre-bound onto the shared builder: a linear
    # output trained and monitored with mean squared error.
    _build_model = partial(
        _build_model,
        loss='mean_squared_error',
        metrics=['mean_squared_error'],
        optimizer='adam',
        output_layer_activation='linear',
    )

    def __init__(self, **sk_params):
        """Install a placeholder model builder, then pass ``sk_params`` to the base class."""
        # Create a prototype with empty input/output sizes;
        # this is needed for sk_params to be processed correctly.
        prototype = partial(self._build_model, input_size=None, output_size=None)
        self.__call__ = prototype
        super().__init__(**sk_params)

    def fit(self, X, y=None, **fit_params):
        """Size the network from ``X``, train it, and return ``self``.

        The value returned by the parent ``fit`` is kept in ``self.history``.
        """
        n_features = X.shape[1]
        # A single output unit is used for regression.
        sized_builder = partial(
            self._build_model,
            input_size=n_features,
            output_size=1,
        )
        self.__call__ = sized_builder
        training_result = super().fit(X, y, **fit_params)
        self.history = training_result
        return self

class DeepLearningTransformer(TransformerMixin, DeepLearningClassifier):
    """Meta-transformer for creating neural network embeddings as features.

    The classifier network is trained by `fit`; `transform` then stacks the
    output of one hidden layer in front of the original features.
    """

    def __init__(self, embedding_layer=-2, **sk_params):
        """Create a DeepLearningTransformer object.

        Parameters
        ----------
        embedding_layer: int
            Index into `hidden_layer_sizes` of the hidden layer whose output
            is used as the embedding. Negative indices count back from the
            last hidden layer. The output layer is appended after the hidden
            layers by the model builder and can not be selected.
        **sk_params:
            Forwarded to `DeepLearningClassifier`. `hidden_layer_sizes`, when
            present, is used to validate `embedding_layer`.

        Raises
        ------
        ValueError
            If `embedding_layer` is not a valid index of `hidden_layer_sizes`.
        """
        # NOTE(review): with the default hidden_layer_sizes (a single layer)
        # the default embedding_layer=-2 is out of range, so one of the two
        # has to be passed explicitly. The default is left untouched here to
        # keep the signature stable -- confirm the intended default.
        layer_sizes = sk_params.get('hidden_layer_sizes', DEFAULT_HIDDEN_LAYERS)
        # Every entry of `hidden_layer_sizes` becomes a hidden Dense layer,
        # so any valid index (including the first and last entry) addresses
        # a real hidden layer. Explicit raise instead of assert so that the
        # check survives `python -O`.
        try:
            layer_sizes[embedding_layer]
        except IndexError:
            raise ValueError(
                f"`embedding_layer` ({embedding_layer}) is not a valid index"
                f" of `hidden_layer_sizes` ({layer_sizes})"
            ) from None
        if embedding_layer < 0:
            # The built model has one extra (output) layer at the end, so a
            # negative index has to be shifted by one to skip over it.
            self.embedding_layer = embedding_layer - 1
        else:
            self.embedding_layer = embedding_layer
        super().__init__(**sk_params)

    def fit(self, *args, **kwargs):
        """Train the underlying network and remember the fitted estimator.

        Returns
        -------
        The object returned by the parent `fit`, also stored in `self._model`.
        """
        model = super().fit(*args, **kwargs)
        self._model = model
        return model

    def transform(self, X):
        """Transform data by adding embedding as features.

        Parameters
        ----------
        X: numpy ndarray, {n_samples, n_components}
            New data, where n_samples is the number of samples and n_components is the number of components.

        Returns
        -------
        X_transformed: array-like, shape (n_samples, embedding + n_features) where embedding is the size of the embedding layer
            The embedding columns followed by the original features.
        """
        X = check_array(X)
        # np.hstack allocates a new array, so X itself is left untouched.
        return np.hstack((self._embedding_keras(X), X))

    def _embedding_keras(self, X):
        """Return the output of the selected embedding layer for `X`."""
        X = check_array(X, accept_sparse=["csr", "csc", "coo"])
        # Backend function mapping the network input to the chosen layer's output.
        get_embedding = keras_backend.function(
            [self._model.model.layers[0].input],
            [self._model.model.layers[self.embedding_layer].output],
        )
        return get_embedding([X])[0]