diff --git a/.coveragerc b/.coveragerc
index c43aa0a2..a7373807 100755
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,4 +1,19 @@
 [run]
+branch = True
+source = afqinsight
 omit =
-    afqinsight/tests/*
-    afqinsight/_version.py
+    tests/*.py
+    */setup.py
+
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    if self.debug:
+    if settings.DEBUG
+    raise AssertionError
+    raise NotImplementedError
+    if 0:
+    if __name__ == .__main__.:
+    if self.verbose:
+show_missing = True
\ No newline at end of file
diff --git a/afqinsight/_serial_bagging.py b/afqinsight/_serial_bagging.py
new file mode 100644
index 00000000..9bf6d76d
--- /dev/null
+++ b/afqinsight/_serial_bagging.py
@@ -0,0 +1,885 @@
+"""Serial execution versions of the sklearn bagging estimators.
+
+This private module is here because I struggle with nested parallelism when
+using dask.distributed as a parallel backend. In order to expose parallelism
+at the base estimator level, I've found that I need meta-estimators and
+cross-validation functions to be run in serial. This is unnecessary when
+using the default joblib backends, but becomes necessary when using
+dask.distributed. If someone can tell me how to fully exploit nested
+parallelism when using a dask.distributed backend, I will gladly remove this
+private module. @richford
+"""
+import itertools
+import numbers
+import numpy as np
+from warnings import warn
+
+from sklearn.ensemble import BaggingClassifier, BaggingRegressor
+from sklearn.ensemble._bagging import (
+    _parallel_build_estimators,
+    _parallel_predict_proba,
+    _parallel_predict_log_proba,
+    _parallel_decision_function,
+    _parallel_predict_regression,
+)
+from sklearn.ensemble._base import _partition_estimators
+from sklearn.utils import check_random_state, check_array
+from sklearn.utils.metaestimators import if_delegate_has_method
+from sklearn.utils.validation import check_is_fitted, _check_sample_weight
+
+
+__all__ = ["SerialBaggingClassifier", "SerialBaggingRegressor"]
+
+MAX_INT = np.iinfo(np.int32).max
+
+
+class SerialBaggingClassifier(BaggingClassifier):
+    """A clone of sklearn.ensemble.BaggingClassifier with serial execution.
+
+    A Bagging classifier is an ensemble meta-estimator that fits base
+    classifiers each on random subsets of the original dataset and then
+    aggregates their individual predictions (either by voting or by
+    averaging) to form a final prediction. Such a meta-estimator can
+    typically be used as a way to reduce the variance of a black-box
+    estimator (e.g., a decision tree), by introducing randomization into its
+    construction procedure and then making an ensemble out of it.
+
+    This algorithm encompasses several works from the literature. When random
+    subsets of the dataset are drawn as random subsets of the samples, then
+    this algorithm is known as Pasting [1]_. If samples are drawn with
+    replacement, then the method is known as Bagging [2]_. When random subsets
+    of the dataset are drawn as random subsets of the features, then the
+    method is known as Random Subspaces [3]_. Finally, when base estimators
+    are built on subsets of both samples and features, then the method is
+    known as Random Patches [4]_.
+
+    Read more in the :ref:`User Guide <bagging>`.
+
+    Parameters
+    ----------
+    base_estimator : object, default=None
+        The base estimator to fit on random subsets of the dataset.
+        If None, then the base estimator is a decision tree.
+
+    n_estimators : int, default=10
+        The number of base estimators in the ensemble.
+ + max_samples : int or float, default=1.0 + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). + + - If int, then draw `max_features` features. + - If float, then draw `max_features * X.shape[1]` features. + + bootstrap : bool, default=True + Whether samples are drawn with replacement. If False, sampling + without replacement is performed. + + bootstrap_features : bool, default=False + Whether features are drawn with replacement. + + oob_score : bool, default=False + Whether to use out-of-bag samples to estimate + the generalization error. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit + a whole new ensemble. See :term:`the Glossary `. + + n_jobs : int, default=None + This parameter has no effect but is kept for conformity with the + parent class. + + random_state : int or RandomState, default=None + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. + Pass an int for reproducible output across multiple function calls. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + Attributes + ---------- + base_estimator_ : estimator + The base estimator from which the ensemble is grown. + + n_features_ : int + The number of features when :meth:`fit` is performed. + + estimators_ : list of estimators + The collection of fitted base estimators. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_classes_ : int or list + The number of classes. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_decision_function_ : ndarray of shape (n_samples, n_classes) + Decision function computed with out-of-bag estimate on the training + set. If n_estimators is small it might be possible that a data point + was never left out during the bootstrap. In this case, + `oob_decision_function_` might contain NaN. This attribute exists + only when ``oob_score`` is True. + + References + ---------- + .. [1] L. Breiman, "Pasting small votes for classification in large + databases and on-line", Machine Learning, 36(1), 85-103, 1999. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140, + 1996. + + .. [3] T. Ho, "The random subspace method for constructing decision + forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, + 1998. + + .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine + Learning and Knowledge Discovery in Databases, 346-361, 2012. 
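+
+    Examples
+    --------
+    A minimal usage sketch with synthetic data; the interface mirrors
+    :class:`sklearn:sklearn.ensemble.BaggingClassifier`:
+
+    >>> from sklearn.svm import SVC
+    >>> from sklearn.datasets import make_classification
+    >>> from afqinsight._serial_bagging import SerialBaggingClassifier
+    >>> X, y = make_classification(n_samples=100, n_features=4,
+    ...                            n_informative=2, random_state=0,
+    ...                            shuffle=False)
+    >>> clf = SerialBaggingClassifier(base_estimator=SVC(),
+    ...                               n_estimators=10,
+    ...                               random_state=0).fit(X, y)
+    >>> clf.predict(X[:1]).shape
+    (1,)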
+ """ + + def __init__( + self, + base_estimator=None, + n_estimators=10, + *, + max_samples=1.0, + max_features=1.0, + bootstrap=True, + bootstrap_features=False, + oob_score=False, + warm_start=False, + n_jobs=None, + random_state=None, + verbose=0, + ): + super().__init__( + base_estimator=base_estimator, + n_estimators=n_estimators, + max_samples=max_samples, + max_features=max_features, + bootstrap=bootstrap, + bootstrap_features=bootstrap_features, + oob_score=oob_score, + warm_start=warm_start, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + ) + + def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): + """Build a Bagging ensemble of estimators from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + y : array-like of shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). + + max_samples : int or float, default=None + Argument to use instead of self.max_samples. + + max_depth : int, default=None + Override value used when constructing base estimator. Only + supported if the base estimator has a max_depth parameter. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + Note that this is supported only if the base estimator supports + sample weighting. + + Returns + ------- + self : object + """ + random_state = check_random_state(self.random_state) + + # Convert data (X is required to be 2d and indexable) + X, y = self._validate_data( + X, + y, + accept_sparse=["csr", "csc"], + dtype=None, + force_all_finite=False, + multi_output=True, + ) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=None) + + # Remap output + n_samples, self.n_features_ = X.shape + self._n_samples = n_samples + y = self._validate_y(y) + + # Check parameters + self._validate_estimator() + + if max_depth is not None: # pragma: no cover + self.base_estimator_.max_depth = max_depth + + # Validate max_samples + if max_samples is None: # pragma: no cover + max_samples = self.max_samples + elif not isinstance(max_samples, numbers.Integral): + max_samples = int(max_samples * X.shape[0]) + + if not (0 < max_samples <= X.shape[0]): + raise ValueError("max_samples must be in (0, n_samples]") + + # Store validated integer row sampling value + self._max_samples = max_samples + + # Validate max_features + if isinstance(self.max_features, numbers.Integral): + max_features = self.max_features + elif isinstance(self.max_features, np.float): + max_features = self.max_features * self.n_features_ + else: + raise ValueError("max_features must be int or float") + + if not (0 < max_features <= self.n_features_): + raise ValueError("max_features must be in (0, n_features]") + + max_features = max(1, int(max_features)) + + # Store validated integer feature sampling value + self._max_features = max_features + + # Other checks + if not self.bootstrap and self.oob_score: # pragma: no cover + raise ValueError( + "Out of bag estimation only available" " if bootstrap=True" + ) + + if self.warm_start and self.oob_score: + raise ValueError( + "Out of bag estimate only available" " if warm_start=False" + ) + + if hasattr(self, "oob_score_") and self.warm_start: + del self.oob_score_ + + if not self.warm_start or not hasattr(self, "estimators_"): + # Free 
allocated memory, if any + self.estimators_ = [] + self.estimators_features_ = [] + + n_more_estimators = self.n_estimators - len(self.estimators_) + + if n_more_estimators < 0: + raise ValueError( + "n_estimators=%d must be larger or equal to " + "len(estimators_)=%d when warm_start==True" + % (self.n_estimators, len(self.estimators_)) + ) + + elif n_more_estimators == 0: + warn( + "Warm-start fitting without increasing n_estimators does not " + "fit new trees." + ) + return self + + # Partition the estimators + n_jobs, n_estimators, starts = _partition_estimators( + n_more_estimators, self.n_jobs + ) + total_n_estimators = sum(n_estimators) + + # Advance random state to state after training + # the first n_estimators + if self.warm_start and len(self.estimators_) > 0: + random_state.randint(MAX_INT, size=len(self.estimators_)) + + seeds = random_state.randint(MAX_INT, size=n_more_estimators) + self._seeds = seeds + + all_results = [ + _parallel_build_estimators( + n_estimators[i], + self, + X, + y, + sample_weight, + seeds[starts[i] : starts[i + 1]], + total_n_estimators, + verbose=self.verbose, + ) + for i in range(n_jobs) + ] + + # Reduce + self.estimators_ += list( + itertools.chain.from_iterable(t[0] for t in all_results) + ) + self.estimators_features_ += list( + itertools.chain.from_iterable(t[1] for t in all_results) + ) + + if self.oob_score: + self._set_oob_score(X, y) + + return self + + def predict_proba(self, X): + """Predict class probabilities for X. + + The predicted class probabilities of an input sample is computed as + the mean predicted class probabilities of the base estimators in the + ensemble. If base estimators do not implement a ``predict_proba`` + method, then it resorts to voting and the predicted class probabilities + of an input sample represents the proportion of estimators predicting + each class. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + check_is_fitted(self) + # Check data + X = check_array( + X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False + ) + + if self.n_features_ != X.shape[1]: + raise ValueError( + "Number of features of the model must " + "match the input. Model n_features is {0} and " + "input n_features is {1}." + "".format(self.n_features_, X.shape[1]) + ) + + # Partition the estimators + n_jobs, n_estimators, starts = _partition_estimators( + self.n_estimators, self.n_jobs + ) + + all_proba = [ + _parallel_predict_proba( + self.estimators_[starts[i] : starts[i + 1]], + self.estimators_features_[starts[i] : starts[i + 1]], + X, + self.n_classes_, + ) + for i in range(n_jobs) + ] + + # Reduce + proba = sum(all_proba) / self.n_estimators + + return proba + + def predict_log_proba(self, X): + """Predict class log-probabilities for X. + + The predicted class log-probabilities of an input sample is computed as + the log of the mean predicted class probabilities of the base + estimators in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. 
+ + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class log-probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + check_is_fitted(self) + if hasattr(self.base_estimator_, "predict_log_proba"): + # Check data + X = check_array( + X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False + ) + + if self.n_features_ != X.shape[1]: + raise ValueError( + "Number of features of the model must " + "match the input. Model n_features is {0} " + "and input n_features is {1} " + "".format(self.n_features_, X.shape[1]) + ) + + # Partition the estimators + n_jobs, n_estimators, starts = _partition_estimators( + self.n_estimators, self.n_jobs + ) + + all_log_proba = [ + _parallel_predict_log_proba( + self.estimators_[starts[i] : starts[i + 1]], + self.estimators_features_[starts[i] : starts[i + 1]], + X, + self.n_classes_, + ) + for i in range(n_jobs) + ] + + # Reduce + log_proba = all_log_proba[0] + + for j in range(1, len(all_log_proba)): + log_proba = np.logaddexp(log_proba, all_log_proba[j]) + + log_proba -= np.log(self.n_estimators) + + return log_proba + + else: + return np.log(self.predict_proba(X)) + + @if_delegate_has_method(delegate="base_estimator") + def decision_function(self, X): + """Average of the decision functions of the base classifiers. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + Returns + ------- + score : ndarray of shape (n_samples, k) + The decision function of the input samples. The columns correspond + to the classes in sorted order, as they appear in the attribute + ``classes_``. Regression and binary classification are special + cases with ``k == 1``, otherwise ``k==n_classes``. + + """ + check_is_fitted(self) + + # Check data + X = check_array( + X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False + ) + + if self.n_features_ != X.shape[1]: + raise ValueError( + "Number of features of the model must " + "match the input. Model n_features is {0} and " + "input n_features is {1} " + "".format(self.n_features_, X.shape[1]) + ) + + # Partition the estimators + n_jobs, n_estimators, starts = _partition_estimators( + self.n_estimators, self.n_jobs + ) + + all_decisions = [ + _parallel_decision_function( + self.estimators_[starts[i] : starts[i + 1]], + self.estimators_features_[starts[i] : starts[i + 1]], + X, + ) + for i in range(n_jobs) + ] + + # Reduce + decisions = sum(all_decisions) / self.n_estimators + + return decisions + + +class SerialBaggingRegressor(BaggingRegressor): + """A Clone of sklearn.ensemble.BaggingRegressor with serial execution. + + A Bagging regressor is an ensemble meta-estimator that fits base + regressors each on random subsets of the original dataset and then + aggregate their individual predictions (either by voting or by averaging) + to form a final prediction. Such a meta-estimator can typically be used as + a way to reduce the variance of a black-box estimator (e.g., a decision + tree), by introducing randomization into its construction procedure and + then making an ensemble out of it. + + This algorithm encompasses several works from the literature. When random + subsets of the dataset are drawn as random subsets of the samples, then + this algorithm is known as Pasting [1]_. If samples are drawn with + replacement, then the method is known as Bagging [2]_. 
When random subsets + of the dataset are drawn as random subsets of the features, then the method + is known as Random Subspaces [3]_. Finally, when base estimators are built + on subsets of both samples and features, then the method is known as + Random Patches [4]_. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.15 + + Parameters + ---------- + base_estimator : object, default=None + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a decision tree. + + n_estimators : int, default=10 + The number of base estimators in the ensemble. + + max_samples : int or float, default=1.0 + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). + + - If int, then draw `max_features` features. + - If float, then draw `max_features * X.shape[1]` features. + + bootstrap : bool, default=True + Whether samples are drawn with replacement. If False, sampling + without replacement is performed. + + bootstrap_features : bool, default=False + Whether features are drawn with replacement. + + oob_score : bool, default=False + Whether to use out-of-bag samples to estimate + the generalization error. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit + a whole new ensemble. See :term:`the Glossary `. + + n_jobs : int, default=None + This parameter has no effect but is kept for conformity with the + parent class. + + random_state : int or RandomState, default=None + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + Attributes + ---------- + base_estimator_ : estimator + The base estimator from which the ensemble is grown. + + n_features_ : int + The number of features when :meth:`fit` is performed. + + estimators_ : list of estimators + The collection of fitted sub-estimators. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_prediction_ : ndarray of shape (n_samples,) + Prediction computed with out-of-bag estimate on the training + set. If n_estimators is small it might be possible that a data point + was never left out during the bootstrap. In this case, + `oob_prediction_` might contain NaN. This attribute exists only + when ``oob_score`` is True. 
+ + Examples + -------- + >>> from sklearn.svm import SVR + >>> from sklearn.ensemble import BaggingRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_samples=100, n_features=4, + ... n_informative=2, n_targets=1, + ... random_state=0, shuffle=False) + >>> regr = BaggingRegressor(base_estimator=SVR(), + ... n_estimators=10, random_state=0).fit(X, y) + >>> regr.predict([[0, 0, 0, 0]]) + array([-2.8720...]) + + References + ---------- + .. [1] L. Breiman, "Pasting small votes for classification in large + databases and on-line", Machine Learning, 36(1), 85-103, 1999. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140, + 1996. + + .. [3] T. Ho, "The random subspace method for constructing decision + forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, + 1998. + + .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine + Learning and Knowledge Discovery in Databases, 346-361, 2012. + """ + + def __init__( + self, + base_estimator=None, + n_estimators=10, + max_samples=1.0, + max_features=1.0, + bootstrap=True, + bootstrap_features=False, + oob_score=False, + warm_start=False, + n_jobs=None, + random_state=None, + verbose=0, + ): + super().__init__( + base_estimator=base_estimator, + n_estimators=n_estimators, + max_samples=max_samples, + max_features=max_features, + bootstrap=bootstrap, + bootstrap_features=bootstrap_features, + oob_score=oob_score, + warm_start=warm_start, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + ) + + def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): + """Build a Bagging ensemble of estimators from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + y : array-like of shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). + + max_samples : int or float, default=None + Argument to use instead of self.max_samples. + + max_depth : int, default=None + Override value used when constructing base estimator. Only + supported if the base estimator has a max_depth parameter. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + Note that this is supported only if the base estimator supports + sample weighting. 
+ + Returns + ------- + self : object + """ + random_state = check_random_state(self.random_state) + + # Convert data (X is required to be 2d and indexable) + X, y = self._validate_data( + X, + y, + accept_sparse=["csr", "csc"], + dtype=None, + force_all_finite=False, + multi_output=True, + ) + if sample_weight is not None: # pragma: no cover + sample_weight = _check_sample_weight(sample_weight, X, dtype=None) + + # Remap output + n_samples, self.n_features_ = X.shape + self._n_samples = n_samples + y = self._validate_y(y) + + # Check parameters + self._validate_estimator() + + if max_depth is not None: # pragma: no cover + self.base_estimator_.max_depth = max_depth + + # Validate max_samples + if max_samples is None: # pragma: no cover + max_samples = self.max_samples + elif not isinstance(max_samples, numbers.Integral): # pragma: no cover + max_samples = int(max_samples * X.shape[0]) + + if not (0 < max_samples <= X.shape[0]): # pragma: no cover + raise ValueError("max_samples must be in (0, n_samples]") + + # Store validated integer row sampling value + self._max_samples = max_samples + + # Validate max_features + if isinstance(self.max_features, numbers.Integral): + max_features = self.max_features + elif isinstance(self.max_features, np.float): # pragma: no cover + max_features = self.max_features * self.n_features_ + else: # pragma: no cover + raise ValueError("max_features must be int or float") + + if not (0 < max_features <= self.n_features_): # pragma: no cover + raise ValueError("max_features must be in (0, n_features]") + + max_features = max(1, int(max_features)) + + # Store validated integer feature sampling value + self._max_features = max_features + + # Other checks + if not self.bootstrap and self.oob_score: # pragma: no cover + raise ValueError( + "Out of bag estimation only available" " if bootstrap=True" + ) + + if self.warm_start and self.oob_score: # pragma: no cover + raise ValueError( + "Out of bag estimate only available" " if warm_start=False" + ) + + if hasattr(self, "oob_score_") and self.warm_start: # pragma: no cover + del self.oob_score_ + + if not self.warm_start or not hasattr(self, "estimators_"): # pragma: no cover + # Free allocated memory, if any + self.estimators_ = [] + self.estimators_features_ = [] + + n_more_estimators = self.n_estimators - len(self.estimators_) + + if n_more_estimators < 0: # pragma: no cover + raise ValueError( + "n_estimators=%d must be larger or equal to " + "len(estimators_)=%d when warm_start==True" + % (self.n_estimators, len(self.estimators_)) + ) + + elif n_more_estimators == 0: # pragma: no cover + warn( + "Warm-start fitting without increasing n_estimators does not " + "fit new trees." 
+ ) + return self + + # Partition the estimators + n_jobs, n_estimators, starts = _partition_estimators( + n_more_estimators, self.n_jobs + ) + total_n_estimators = sum(n_estimators) + + # Advance random state to state after training + # the first n_estimators + if self.warm_start and len(self.estimators_) > 0: # pragma: no cover + random_state.randint(MAX_INT, size=len(self.estimators_)) + + seeds = random_state.randint(MAX_INT, size=n_more_estimators) + self._seeds = seeds + + all_results = [ + _parallel_build_estimators( + n_estimators[i], + self, + X, + y, + sample_weight, + seeds[starts[i] : starts[i + 1]], + total_n_estimators, + verbose=self.verbose, + ) + for i in range(n_jobs) + ] + + # Reduce + self.estimators_ += list( + itertools.chain.from_iterable(t[0] for t in all_results) + ) + self.estimators_features_ += list( + itertools.chain.from_iterable(t[1] for t in all_results) + ) + + if self.oob_score: + self._set_oob_score(X, y) + + return self + + def predict(self, X): + """Predict regression target for X. + + The predicted regression target of an input sample is computed as the + mean predicted regression targets of the estimators in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted values. + """ + check_is_fitted(self) + # Check data + X = check_array( + X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False + ) + + # Partition the estimators + n_jobs, n_estimators, starts = _partition_estimators( + self.n_estimators, self.n_jobs + ) + + all_y_hat = [ + _parallel_predict_regression( + self.estimators_[starts[i] : starts[i + 1]], + self.estimators_features_[starts[i] : starts[i + 1]], + X, + ) + for i in range(n_jobs) + ] + + # Reduce + y_hat = sum(all_y_hat) / self.n_estimators + + return y_hat diff --git a/afqinsight/pipeline.py b/afqinsight/pipeline.py index 92dc1929..f583feb9 100755 --- a/afqinsight/pipeline.py +++ b/afqinsight/pipeline.py @@ -18,6 +18,8 @@ from sklearn.preprocessing import PowerTransformer from string import Template +from ._serial_bagging import SerialBaggingClassifier, SerialBaggingRegressor + __all__ = ["make_afq_classifier_pipeline", "make_afq_regressor_pipeline"] @@ -107,7 +109,7 @@ def make_base_afq_pipeline( The estimator to use as the last step of the pipeline. If provided, it must inherit from :class:`sklearn:sklearn.base.BaseEstimator` - ensemble_meta_estimator : "bagging", "adaboost", or None + ensemble_meta_estimator : "bagging", "adaboost", "serial-bagging", or None An optional ensemble meta-estimator to combine the predictions of several base estimators. 
"Adaboost" will result in the use of :class:`sklearn:sklearn.ensemble.AdaBoostClassifier` for classifier @@ -259,7 +261,7 @@ def call_with_kwargs(Transformer, kwargs): base_estimator = call_with_kwargs(estimator, estimator_kwargs) if ensemble_meta_estimator is not None: - allowed = ["bagging", "adaboost"] + allowed = ["bagging", "adaboost", "serial-bagging"] err_msg = Template( ensembler_msg.safe_substitute( kw="ensemble_meta_estimator", allowed=allowed @@ -282,6 +284,15 @@ def call_with_kwargs(Transformer, kwargs): ensembler = call_with_kwargs( BaggingRegressor, ensembler_kwargs ) + elif ensemble_meta_estimator.lower() == "serial-bagging": + if is_classifier(base_estimator): + ensembler = call_with_kwargs( + SerialBaggingClassifier, ensembler_kwargs + ) + elif is_regressor(base_estimator): + ensembler = call_with_kwargs( + SerialBaggingRegressor, ensembler_kwargs + ) elif ensemble_meta_estimator.lower() == "adaboost": if is_classifier(base_estimator): ensembler = call_with_kwargs( diff --git a/afqinsight/tests/test_bagging.py b/afqinsight/tests/test_bagging.py new file mode 100644 index 00000000..a89fdd7a --- /dev/null +++ b/afqinsight/tests/test_bagging.py @@ -0,0 +1,953 @@ +""" +Testing for the serial bagging ensemble module (afqinsight._serial_bagging). +""" + +# Author: Gilles Louppe +# License: BSD 3 clause + +import numpy as np +import joblib + +from afqinsight._serial_bagging import SerialBaggingClassifier, SerialBaggingRegressor + +from sklearn.base import BaseEstimator + +from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_raises +from sklearn.utils._testing import assert_warns +from sklearn.utils._testing import assert_warns_message +from sklearn.utils._testing import assert_raise_message +from sklearn.utils._testing import ignore_warnings + +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.model_selection import GridSearchCV, ParameterGrid +from sklearn.linear_model import Perceptron, LogisticRegression +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.svm import SVC, SVR +from sklearn.random_projection import SparseRandomProjection +from sklearn.pipeline import make_pipeline +from sklearn.feature_selection import SelectKBest +from sklearn.model_selection import train_test_split +from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2 +from sklearn.utils import check_random_state +from sklearn.preprocessing import FunctionTransformer + +from scipy.sparse import csc_matrix, csr_matrix + +rng = check_random_state(0) + +# also load the iris dataset +# and randomly permute it +iris = load_iris() +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + +# also load the diabetes dataset +# and randomly permute it +diabetes = load_diabetes() +perm = rng.permutation(diabetes.target.size) +diabetes.data = diabetes.data[perm] +diabetes.target = diabetes.target[perm] + + +# TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates +@ignore_warnings(category=FutureWarning) +def test_classification(): + # Check classification for various parameter settings. 
+ rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + grid = ParameterGrid( + { + "max_samples": [0.5, 1.0], + "max_features": [1, 2, 4], + "bootstrap": [True, False], + "bootstrap_features": [True, False], + } + ) + + for base_estimator in [ + None, + DummyClassifier(), + Perceptron(), + DecisionTreeClassifier(), + KNeighborsClassifier(), + SVC(), + ]: + for params in grid: + SerialBaggingClassifier( + base_estimator=base_estimator, random_state=rng, **params + ).fit(X_train, y_train).predict(X_test) + + +def test_sparse_classification(): + # Check classification for various parameter settings on sparse input. + + class CustomSVC(SVC): + """SVC variant that records the nature of the training set""" + + def fit(self, X, y): + super().fit(X, y) + self.data_type_ = type(X) + return self + + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + parameter_sets = [ + { + "max_samples": 0.5, + "max_features": 2, + "bootstrap": True, + "bootstrap_features": True, + }, + { + "max_samples": 1.0, + "max_features": 4, + "bootstrap": True, + "bootstrap_features": True, + }, + {"max_features": 2, "bootstrap": False, "bootstrap_features": True}, + {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False}, + ] + + for sparse_format in [csc_matrix, csr_matrix]: + X_train_sparse = sparse_format(X_train) + X_test_sparse = sparse_format(X_test) + for params in parameter_sets: + for f in [ + "predict", + "predict_proba", + "predict_log_proba", + "decision_function", + ]: + # Trained on sparse format + sparse_classifier = SerialBaggingClassifier( + base_estimator=CustomSVC(decision_function_shape="ovr"), + random_state=1, + **params, + ).fit(X_train_sparse, y_train) + sparse_results = getattr(sparse_classifier, f)(X_test_sparse) + + # Trained on dense format + dense_classifier = SerialBaggingClassifier( + base_estimator=CustomSVC(decision_function_shape="ovr"), + random_state=1, + **params, + ).fit(X_train, y_train) + dense_results = getattr(dense_classifier, f)(X_test) + assert_array_almost_equal(sparse_results, dense_results) + + sparse_type = type(X_train_sparse) + types = [i.data_type_ for i in sparse_classifier.estimators_] + + assert all([t == sparse_type for t in types]) + + +def test_regression(): + # Check regression for various parameter settings. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data[:50], diabetes.target[:50], random_state=rng + ) + grid = ParameterGrid( + { + "max_samples": [0.5, 1.0], + "max_features": [0.5, 1.0], + "bootstrap": [True, False], + "bootstrap_features": [True, False], + } + ) + + for base_estimator in [ + None, + DummyRegressor(), + DecisionTreeRegressor(), + KNeighborsRegressor(), + SVR(), + ]: + for params in grid: + SerialBaggingRegressor( + base_estimator=base_estimator, random_state=rng, **params + ).fit(X_train, y_train).predict(X_test) + + +def test_sparse_regression(): + # Check regression for various parameter settings on sparse input. 
+ rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data[:50], diabetes.target[:50], random_state=rng + ) + + class CustomSVR(SVR): + """SVC variant that records the nature of the training set""" + + def fit(self, X, y): + super().fit(X, y) + self.data_type_ = type(X) + return self + + parameter_sets = [ + { + "max_samples": 0.5, + "max_features": 2, + "bootstrap": True, + "bootstrap_features": True, + }, + { + "max_samples": 1.0, + "max_features": 4, + "bootstrap": True, + "bootstrap_features": True, + }, + {"max_features": 2, "bootstrap": False, "bootstrap_features": True}, + {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False}, + ] + + for sparse_format in [csc_matrix, csr_matrix]: + X_train_sparse = sparse_format(X_train) + X_test_sparse = sparse_format(X_test) + for params in parameter_sets: + + # Trained on sparse format + sparse_classifier = SerialBaggingRegressor( + base_estimator=CustomSVR(), random_state=1, **params + ).fit(X_train_sparse, y_train) + sparse_results = sparse_classifier.predict(X_test_sparse) + + # Trained on dense format + dense_results = ( + SerialBaggingRegressor( + base_estimator=CustomSVR(), random_state=1, **params + ) + .fit(X_train, y_train) + .predict(X_test) + ) + + sparse_type = type(X_train_sparse) + types = [i.data_type_ for i in sparse_classifier.estimators_] + + assert_array_almost_equal(sparse_results, dense_results) + assert all([t == sparse_type for t in types]) + assert_array_almost_equal(sparse_results, dense_results) + + +class DummySizeEstimator(BaseEstimator): + def fit(self, X, y): + self.training_size_ = X.shape[0] + self.training_hash_ = joblib.hash(X) + + +def test_bootstrap_samples(): + # Test that bootstrapping samples generate non-perfect base estimators. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + base_estimator = DecisionTreeRegressor().fit(X_train, y_train) + + # without bootstrap, all trees are perfect on the training set + ensemble = SerialBaggingRegressor( + base_estimator=DecisionTreeRegressor(), + max_samples=1.0, + bootstrap=False, + random_state=rng, + ).fit(X_train, y_train) + + assert base_estimator.score(X_train, y_train) == ensemble.score(X_train, y_train) + + # with bootstrap, trees are no longer perfect on the training set + ensemble = SerialBaggingRegressor( + base_estimator=DecisionTreeRegressor(), + max_samples=1.0, + bootstrap=True, + random_state=rng, + ).fit(X_train, y_train) + + assert base_estimator.score(X_train, y_train) > ensemble.score(X_train, y_train) + + # check that each sampling correspond to a complete bootstrap resample. + # the size of each bootstrap should be the same as the input data but + # the data should be different (checked using the hash of the data). + ensemble = SerialBaggingRegressor( + base_estimator=DummySizeEstimator(), bootstrap=True + ).fit(X_train, y_train) + training_hash = [] + for estimator in ensemble.estimators_: + assert estimator.training_size_ == X_train.shape[0] + training_hash.append(estimator.training_hash_) + assert len(set(training_hash)) == len(training_hash) + + +def test_bootstrap_features(): + # Test that bootstrapping features may generate duplicate features. 
+ rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + ensemble = SerialBaggingRegressor( + base_estimator=DecisionTreeRegressor(), + max_features=1.0, + bootstrap_features=False, + random_state=rng, + ).fit(X_train, y_train) + + for features in ensemble.estimators_features_: + assert diabetes.data.shape[1] == np.unique(features).shape[0] + + ensemble = SerialBaggingRegressor( + base_estimator=DecisionTreeRegressor(), + max_features=1.0, + bootstrap_features=True, + random_state=rng, + ).fit(X_train, y_train) + + for features in ensemble.estimators_features_: + assert diabetes.data.shape[1] > np.unique(features).shape[0] + + +def test_probability(): + # Predict probabilities. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + + with np.errstate(divide="ignore", invalid="ignore"): + # Normal case + ensemble = SerialBaggingClassifier( + base_estimator=DecisionTreeClassifier(), random_state=rng + ).fit(X_train, y_train) + + assert_array_almost_equal( + np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test)) + ) + + assert_array_almost_equal( + ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test)) + ) + + # Degenerate case, where some classes are missing + ensemble = SerialBaggingClassifier( + base_estimator=LogisticRegression(), random_state=rng, max_samples=5 + ).fit(X_train, y_train) + + assert_array_almost_equal( + np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test)) + ) + + assert_array_almost_equal( + ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test)) + ) + + +def test_oob_score_classification(): + # Check that oob prediction is a good estimation of the generalization + # error. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + + for base_estimator in [DecisionTreeClassifier(), SVC()]: + clf = SerialBaggingClassifier( + base_estimator=base_estimator, + n_estimators=100, + bootstrap=True, + oob_score=True, + random_state=rng, + ).fit(X_train, y_train) + + test_score = clf.score(X_test, y_test) + + assert abs(test_score - clf.oob_score_) < 0.1 + + # Test with few estimators + assert_warns( + UserWarning, + SerialBaggingClassifier( + base_estimator=base_estimator, + n_estimators=1, + bootstrap=True, + oob_score=True, + random_state=rng, + ).fit, + X_train, + y_train, + ) + + +def test_oob_score_regression(): + # Check that oob prediction is a good estimation of the generalization + # error. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + clf = SerialBaggingRegressor( + base_estimator=DecisionTreeRegressor(), + n_estimators=50, + bootstrap=True, + oob_score=True, + random_state=rng, + ).fit(X_train, y_train) + + test_score = clf.score(X_test, y_test) + + assert abs(test_score - clf.oob_score_) < 0.1 + + # Test with few estimators + assert_warns( + UserWarning, + SerialBaggingRegressor( + base_estimator=DecisionTreeRegressor(), + n_estimators=1, + bootstrap=True, + oob_score=True, + random_state=rng, + ).fit, + X_train, + y_train, + ) + + +def test_single_estimator(): + # Check singleton ensembles. 
+ rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + clf1 = SerialBaggingRegressor( + base_estimator=KNeighborsRegressor(), + n_estimators=1, + bootstrap=False, + bootstrap_features=False, + random_state=rng, + ).fit(X_train, y_train) + + clf2 = KNeighborsRegressor().fit(X_train, y_train) + + assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test)) + + +def test_error(): + # Test that it gives proper exception on deficient input. + X, y = iris.data, iris.target + base = DecisionTreeClassifier() + + # Test max_samples + assert_raises(ValueError, SerialBaggingClassifier(base, max_samples=-1).fit, X, y) + assert_raises(ValueError, SerialBaggingClassifier(base, max_samples=0.0).fit, X, y) + assert_raises(ValueError, SerialBaggingClassifier(base, max_samples=2.0).fit, X, y) + assert_raises(ValueError, SerialBaggingClassifier(base, max_samples=1000).fit, X, y) + assert_raises( + ValueError, SerialBaggingClassifier(base, max_samples="foobar").fit, X, y + ) + + # Test max_features + assert_raises(ValueError, SerialBaggingClassifier(base, max_features=-1).fit, X, y) + assert_raises(ValueError, SerialBaggingClassifier(base, max_features=0.0).fit, X, y) + assert_raises(ValueError, SerialBaggingClassifier(base, max_features=2.0).fit, X, y) + assert_raises(ValueError, SerialBaggingClassifier(base, max_features=5).fit, X, y) + assert_raises( + ValueError, SerialBaggingClassifier(base, max_features="foobar").fit, X, y + ) + + # Test support of decision_function + assert not hasattr(SerialBaggingClassifier(base).fit(X, y), "decision_function") + + +def test_parallel_classification(): + # Check parallel classification. + rng = check_random_state(0) + + # Classification + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + + ensemble = SerialBaggingClassifier( + DecisionTreeClassifier(), n_jobs=3, random_state=0 + ).fit(X_train, y_train) + + # predict_proba + ensemble.set_params(n_jobs=1) + y1 = ensemble.predict_proba(X_test) + ensemble.set_params(n_jobs=2) + y2 = ensemble.predict_proba(X_test) + assert_array_almost_equal(y1, y2) + + ensemble = SerialBaggingClassifier( + DecisionTreeClassifier(), n_jobs=1, random_state=0 + ).fit(X_train, y_train) + + y3 = ensemble.predict_proba(X_test) + assert_array_almost_equal(y1, y3) + + # decision_function + ensemble = SerialBaggingClassifier( + SVC(decision_function_shape="ovr"), n_jobs=3, random_state=0 + ).fit(X_train, y_train) + + ensemble.set_params(n_jobs=1) + decisions1 = ensemble.decision_function(X_test) + ensemble.set_params(n_jobs=2) + decisions2 = ensemble.decision_function(X_test) + assert_array_almost_equal(decisions1, decisions2) + + X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1)))) + assert_raise_message( + ValueError, + "Number of features of the model " + "must match the input. Model n_features is {0} " + "and input n_features is {1} " + "".format(X_test.shape[1], X_err.shape[1]), + ensemble.decision_function, + X_err, + ) + + ensemble = SerialBaggingClassifier( + SVC(decision_function_shape="ovr"), n_jobs=1, random_state=0 + ).fit(X_train, y_train) + + decisions3 = ensemble.decision_function(X_test) + assert_array_almost_equal(decisions1, decisions3) + + +def test_parallel_regression(): + # Check parallel regression. 
+ rng = check_random_state(0) + + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + ensemble = SerialBaggingRegressor( + DecisionTreeRegressor(), n_jobs=3, random_state=0 + ).fit(X_train, y_train) + + ensemble.set_params(n_jobs=1) + y1 = ensemble.predict(X_test) + ensemble.set_params(n_jobs=2) + y2 = ensemble.predict(X_test) + assert_array_almost_equal(y1, y2) + + ensemble = SerialBaggingRegressor( + DecisionTreeRegressor(), n_jobs=1, random_state=0 + ).fit(X_train, y_train) + + y3 = ensemble.predict(X_test) + assert_array_almost_equal(y1, y3) + + +def test_gridsearch(): + # Check that bagging ensembles can be grid-searched. + # Transform iris into a binary classification task + X, y = iris.data, iris.target + y[y == 2] = 1 + + # Grid search with scoring based on decision_function + parameters = {"n_estimators": (1, 2), "base_estimator__C": (1, 2)} + + GridSearchCV(SerialBaggingClassifier(SVC()), parameters, scoring="roc_auc").fit( + X, y + ) + + +def test_base_estimator(): + # Check base_estimator and its default values. + rng = check_random_state(0) + + # Classification + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + + ensemble = SerialBaggingClassifier(None, n_jobs=3, random_state=0).fit( + X_train, y_train + ) + + assert isinstance(ensemble.base_estimator_, DecisionTreeClassifier) + + ensemble = SerialBaggingClassifier( + DecisionTreeClassifier(), n_jobs=3, random_state=0 + ).fit(X_train, y_train) + + assert isinstance(ensemble.base_estimator_, DecisionTreeClassifier) + + ensemble = SerialBaggingClassifier(Perceptron(), n_jobs=3, random_state=0).fit( + X_train, y_train + ) + + assert isinstance(ensemble.base_estimator_, Perceptron) + + # Regression + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + ensemble = SerialBaggingRegressor(None, n_jobs=3, random_state=0).fit( + X_train, y_train + ) + + assert isinstance(ensemble.base_estimator_, DecisionTreeRegressor) + + ensemble = SerialBaggingRegressor( + DecisionTreeRegressor(), n_jobs=3, random_state=0 + ).fit(X_train, y_train) + + assert isinstance(ensemble.base_estimator_, DecisionTreeRegressor) + + ensemble = SerialBaggingRegressor(SVR(), n_jobs=3, random_state=0).fit( + X_train, y_train + ) + assert isinstance(ensemble.base_estimator_, SVR) + + +def test_bagging_with_pipeline(): + estimator = SerialBaggingClassifier( + make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()), max_features=2 + ) + estimator.fit(iris.data, iris.target) + assert isinstance(estimator[0].steps[-1][1].random_state, int) + + +class DummyZeroEstimator(BaseEstimator): + def fit(self, X, y): + self.classes_ = np.unique(y) + return self + + def predict(self, X): + return self.classes_[np.zeros(X.shape[0], dtype=int)] + + +def test_bagging_sample_weight_unsupported_but_passed(): + estimator = SerialBaggingClassifier(DummyZeroEstimator()) + rng = check_random_state(0) + + estimator.fit(iris.data, iris.target).predict(iris.data) + assert_raises( + ValueError, + estimator.fit, + iris.data, + iris.target, + sample_weight=rng.randint(10, size=(iris.data.shape[0])), + ) + + +def test_warm_start(random_state=42): + # Test if fitting incrementally with warm start gives a forest of the + # right size and the same results as a normal fit. 
+ X, y = make_hastie_10_2(n_samples=20, random_state=1) + + clf_ws = None + for n_estimators in [5, 10]: + if clf_ws is None: + clf_ws = SerialBaggingClassifier( + n_estimators=n_estimators, random_state=random_state, warm_start=True + ) + else: + clf_ws.set_params(n_estimators=n_estimators) + clf_ws.fit(X, y) + assert len(clf_ws) == n_estimators + + clf_no_ws = SerialBaggingClassifier( + n_estimators=10, random_state=random_state, warm_start=False + ) + clf_no_ws.fit(X, y) + + assert set([tree.random_state for tree in clf_ws]) == set( + [tree.random_state for tree in clf_no_ws] + ) + + +def test_warm_start_smaller_n_estimators(): + # Test if warm start'ed second fit with smaller n_estimators raises error. + X, y = make_hastie_10_2(n_samples=20, random_state=1) + clf = SerialBaggingClassifier(n_estimators=5, warm_start=True) + clf.fit(X, y) + clf.set_params(n_estimators=4) + assert_raises(ValueError, clf.fit, X, y) + + +def test_warm_start_equal_n_estimators(): + # Test that nothing happens when fitting without increasing n_estimators + X, y = make_hastie_10_2(n_samples=20, random_state=1) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) + + clf = SerialBaggingClassifier(n_estimators=5, warm_start=True, random_state=83) + clf.fit(X_train, y_train) + + y_pred = clf.predict(X_test) + # modify X to nonsense values, this should not change anything + X_train += 1.0 + + assert_warns_message( + UserWarning, + "Warm-start fitting without increasing n_estimators does not", + clf.fit, + X_train, + y_train, + ) + assert_array_equal(y_pred, clf.predict(X_test)) + + +def test_warm_start_equivalence(): + # warm started classifier with 5+5 estimators should be equivalent to + # one classifier with 10 estimators + X, y = make_hastie_10_2(n_samples=20, random_state=1) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) + + clf_ws = SerialBaggingClassifier(n_estimators=5, warm_start=True, random_state=3141) + clf_ws.fit(X_train, y_train) + clf_ws.set_params(n_estimators=10) + clf_ws.fit(X_train, y_train) + y1 = clf_ws.predict(X_test) + + clf = SerialBaggingClassifier(n_estimators=10, warm_start=False, random_state=3141) + clf.fit(X_train, y_train) + y2 = clf.predict(X_test) + + assert_array_almost_equal(y1, y2) + + +def test_warm_start_with_oob_score_fails(): + # Check using oob_score and warm_start simultaneously fails + X, y = make_hastie_10_2(n_samples=20, random_state=1) + clf = SerialBaggingClassifier(n_estimators=5, warm_start=True, oob_score=True) + assert_raises(ValueError, clf.fit, X, y) + + +def test_oob_score_removed_on_warm_start(): + X, y = make_hastie_10_2(n_samples=2000, random_state=1) + + clf = SerialBaggingClassifier(n_estimators=50, oob_score=True) + clf.fit(X, y) + + clf.set_params(warm_start=True, oob_score=False, n_estimators=100) + clf.fit(X, y) + + assert_raises(AttributeError, getattr, clf, "oob_score_") + + +def test_oob_score_consistency(): + # Make sure OOB scores are identical when random_state, estimator, and + # training data are fixed and fitting is done twice + X, y = make_hastie_10_2(n_samples=200, random_state=1) + bagging = SerialBaggingClassifier( + KNeighborsClassifier(), + max_samples=0.5, + max_features=0.5, + oob_score=True, + random_state=1, + ) + assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_ + + +def test_estimators_samples(): + # Check that format of estimators_samples_ is correct and that results + # generated at fit time can be identically reproduced at a later time + # using data saved 
in object attributes. + X, y = make_hastie_10_2(n_samples=200, random_state=1) + bagging = SerialBaggingClassifier( + LogisticRegression(), + max_samples=0.5, + max_features=0.5, + random_state=1, + bootstrap=False, + ) + bagging.fit(X, y) + + # Get relevant attributes + estimators_samples = bagging.estimators_samples_ + estimators_features = bagging.estimators_features_ + estimators = bagging.estimators_ + + # Test for correct formatting + assert len(estimators_samples) == len(estimators) + assert len(estimators_samples[0]) == len(X) // 2 + assert estimators_samples[0].dtype.kind == "i" + + # Re-fit single estimator to test for consistent sampling + estimator_index = 0 + estimator_samples = estimators_samples[estimator_index] + estimator_features = estimators_features[estimator_index] + estimator = estimators[estimator_index] + + X_train = (X[estimator_samples])[:, estimator_features] + y_train = y[estimator_samples] + + orig_coefs = estimator.coef_ + estimator.fit(X_train, y_train) + new_coefs = estimator.coef_ + + assert_array_almost_equal(orig_coefs, new_coefs) + + +def test_estimators_samples_deterministic(): + # This test is a regression test to check that with a random step + # (e.g. SparseRandomProjection) and a given random state, the results + # generated at fit time can be identically reproduced at a later time using + # data saved in object attributes. Check issue #9524 for full discussion. + + iris = load_iris() + X, y = iris.data, iris.target + + base_pipeline = make_pipeline( + SparseRandomProjection(n_components=2), LogisticRegression() + ) + clf = SerialBaggingClassifier( + base_estimator=base_pipeline, max_samples=0.5, random_state=0 + ) + clf.fit(X, y) + pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy() + + estimator = clf.estimators_[0] + estimator_sample = clf.estimators_samples_[0] + estimator_feature = clf.estimators_features_[0] + + X_train = (X[estimator_sample])[:, estimator_feature] + y_train = y[estimator_sample] + + estimator.fit(X_train, y_train) + assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef) + + +def test_max_samples_consistency(): + # Make sure validated max_samples and original max_samples are identical + # when valid integer max_samples supplied by user + max_samples = 100 + X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1) + bagging = SerialBaggingClassifier( + KNeighborsClassifier(), + max_samples=max_samples, + max_features=0.5, + random_state=1, + ) + bagging.fit(X, y) + assert bagging._max_samples == max_samples + + +def test_set_oob_score_label_encoding(): + # Make sure the oob_score doesn't change when the labels change + # See: https://github.com/scikit-learn/scikit-learn/issues/8933 + random_state = 5 + X = [[-1], [0], [1]] * 5 + Y1 = ["A", "B", "C"] * 5 + Y2 = [-1, 0, 1] * 5 + Y3 = [0, 1, 2] * 5 + x1 = ( + SerialBaggingClassifier(oob_score=True, random_state=random_state) + .fit(X, Y1) + .oob_score_ + ) + x2 = ( + SerialBaggingClassifier(oob_score=True, random_state=random_state) + .fit(X, Y2) + .oob_score_ + ) + x3 = ( + SerialBaggingClassifier(oob_score=True, random_state=random_state) + .fit(X, Y3) + .oob_score_ + ) + assert [x1, x2] == [x3, x3] + + +def replace(X): + X = X.astype("float", copy=True) + X[~np.isfinite(X)] = 0 + return X + + +def test_bagging_regressor_with_missing_inputs(): + # Check that SerialBaggingRegressor can accept X with missing/infinite data + X = np.array( + [[1, 3, 5], [2, None, 6], [2, np.nan, 6], [2, np.inf, 6], [2, np.NINF, 6]] + ) + y_values = [ + 
np.array([2, 3, 3, 3, 3]), + np.array([[2, 1, 9], [3, 6, 8], [3, 6, 8], [3, 6, 8], [3, 6, 8]]), + ] + for y in y_values: + regressor = DecisionTreeRegressor() + pipeline = make_pipeline(FunctionTransformer(replace), regressor) + pipeline.fit(X, y).predict(X) + bagging_regressor = SerialBaggingRegressor(pipeline) + y_hat = bagging_regressor.fit(X, y).predict(X) + assert y.shape == y_hat.shape + + # Verify that exceptions can be raised by wrapper regressor + regressor = DecisionTreeRegressor() + pipeline = make_pipeline(regressor) + assert_raises(ValueError, pipeline.fit, X, y) + bagging_regressor = SerialBaggingRegressor(pipeline) + assert_raises(ValueError, bagging_regressor.fit, X, y) + + +def test_bagging_classifier_with_missing_inputs(): + # Check that SerialBaggingClassifier can accept X with missing/infinite data + X = np.array( + [[1, 3, 5], [2, None, 6], [2, np.nan, 6], [2, np.inf, 6], [2, np.NINF, 6]] + ) + y = np.array([3, 6, 6, 6, 6]) + classifier = DecisionTreeClassifier() + pipeline = make_pipeline(FunctionTransformer(replace), classifier) + pipeline.fit(X, y).predict(X) + bagging_classifier = SerialBaggingClassifier(pipeline) + bagging_classifier.fit(X, y) + y_hat = bagging_classifier.predict(X) + assert y.shape == y_hat.shape + bagging_classifier.predict_log_proba(X) + bagging_classifier.predict_proba(X) + + # Verify that exceptions can be raised by wrapper classifier + classifier = DecisionTreeClassifier() + pipeline = make_pipeline(classifier) + assert_raises(ValueError, pipeline.fit, X, y) + bagging_classifier = SerialBaggingClassifier(pipeline) + assert_raises(ValueError, bagging_classifier.fit, X, y) + + +def test_bagging_small_max_features(): + # Check that Bagging estimator can accept low fractional max_features + + X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + + bagging = SerialBaggingClassifier( + LogisticRegression(), max_features=0.3, random_state=1 + ) + bagging.fit(X, y) + + +def test_bagging_get_estimators_indices(): + # Check that Bagging estimator can generate sample indices properly + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16436 + + rng = np.random.RandomState(0) + X = rng.randn(13, 4) + y = np.arange(13) + + class MyEstimator(DecisionTreeRegressor): + """An estimator which stores y indices information at fit.""" + + def fit(self, X, y): + self._sample_indices = y + + clf = SerialBaggingRegressor( + base_estimator=MyEstimator(), n_estimators=1, random_state=0 + ) + clf.fit(X, y) + + assert_array_equal(clf.estimators_[0]._sample_indices, clf.estimators_samples_[0]) diff --git a/afqinsight/tests/test_pipelines.py b/afqinsight/tests/test_pipelines.py index 4c202ee9..6b1f6b39 100644 --- a/afqinsight/tests/test_pipelines.py +++ b/afqinsight/tests/test_pipelines.py @@ -3,6 +3,7 @@ from afqinsight import make_afq_classifier_pipeline, make_afq_regressor_pipeline from afqinsight.pipeline import make_base_afq_pipeline +from afqinsight._serial_bagging import SerialBaggingClassifier, SerialBaggingRegressor from sklearn.base import is_classifier from sklearn.ensemble import BaggingClassifier, BaggingRegressor from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor @@ -39,6 +40,7 @@ ensembler_args = [ ("bagging", {True: BaggingClassifier, False: BaggingRegressor}), ("adaboost", {True: AdaBoostClassifier, False: AdaBoostRegressor}), + ("serial-bagging", {True: SerialBaggingClassifier, False: SerialBaggingRegressor}), (AdaBoostClassifier, {True: AdaBoostClassifier, False: AdaBoostClassifier}), (None, None), ]
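
Usage sketch (illustrative only): with this patch, passing
ensemble_meta_estimator="serial-bagging" to the AFQ pipeline factories wraps
the base estimator in SerialBaggingClassifier or SerialBaggingRegressor
instead of the joblib-parallel sklearn ensembles, keeping the meta-estimator
serial when an outer dask.distributed scheduler supplies the parallelism. The
snippet below assumes make_afq_classifier_pipeline forwards
ensemble_meta_estimator to make_base_afq_pipeline (as the updated tests
suggest) and that X and y stand in for an AFQ feature matrix and target
vector.

    from afqinsight import make_afq_classifier_pipeline

    # The final pipeline step becomes a SerialBaggingClassifier wrapped
    # around the default base classifier; bagging members are built one at
    # a time instead of via joblib workers.
    pipe = make_afq_classifier_pipeline(ensemble_meta_estimator="serial-bagging")
    # pipe.fit(X, y)
    # pipe.predict(X)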