# -*- encoding: utf-8 -*-
from __future__ import annotations
from typing import (
Any,
Dict,
Iterable,
List,
Mapping,
Optional,
Sequence,
Tuple,
Type,
Union,
)
import warnings
import dask.distributed
import joblib
import numpy as np
import pandas as pd
from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
from scipy.sparse import spmatrix
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.utils.multiclass import type_of_target
from smac.runhistory.runhistory import RunInfo, RunValue
from typing_extensions import Literal
from autosklearn.automl import AutoML, AutoMLClassifier, AutoMLRegressor
from autosklearn.data.validation import (
SUPPORTED_FEAT_TYPES,
SUPPORTED_TARGET_TYPES,
convert_if_sparse,
)
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
from autosklearn.ensembles.ensemble_selection import EnsembleSelection
from autosklearn.ensembles.multiobjective_dummy_ensemble import (
MultiObjectiveDummyEnsemble,
)
from autosklearn.metrics import Scorer
from autosklearn.pipeline.base import BasePipeline
from autosklearn.util.smac_wrap import SMACCallback
class AutoSklearnEstimator(BaseEstimator):
def __init__(
self,
time_left_for_this_task=3600,
per_run_time_limit=None,
initial_configurations_via_metalearning=25,
ensemble_size: int | None = None,
ensemble_class: Type[AbstractEnsemble] | Literal["default"] | None = "default",
ensemble_kwargs: Dict[str, Any] | None = None,
ensemble_nbest=50,
max_models_on_disc=50,
seed=1,
memory_limit=3072,
include: Optional[Dict[str, List[str]]] = None,
exclude: Optional[Dict[str, List[str]]] = None,
resampling_strategy="holdout",
resampling_strategy_arguments=None,
tmp_folder=None,
delete_tmp_folder_after_terminate=True,
n_jobs: Optional[int] = None,
dask_client: Optional[dask.distributed.Client] = None,
disable_evaluator_output=False,
get_smac_object_callback=None,
smac_scenario_args=None,
logging_config=None,
metadata_directory=None,
metric: Scorer | Sequence[Scorer] | None = None,
scoring_functions: Optional[List[Scorer]] = None,
load_models: bool = True,
get_trials_callback: SMACCallback | None = None,
dataset_compression: Union[bool, Mapping[str, Any]] = True,
allow_string_features: bool = True,
disable_progress_bar: bool = False,
):
"""
Parameters
----------
time_left_for_this_task : int, optional (default=3600)
Time limit in seconds for the search of appropriate
models. By increasing this value, *auto-sklearn* has a higher
chance of finding better models.
per_run_time_limit : int, optional (default=1/10 of time_left_for_this_task)
Time limit for a single call to the machine learning model.
Model fitting will be terminated if the machine learning
algorithm runs over the time limit. Set this value high enough so
that typical machine learning algorithms can be fit on the
training data.
initial_configurations_via_metalearning : int, optional (default=25)
Initialize the hyperparameter optimization algorithm with this
many configurations which worked well on previously seen
datasets. Disable if the hyperparameter optimization algorithm
should start from scratch.
ensemble_size : int, optional
Number of models added to the ensemble built by *Ensemble
selection from libraries of models*. Models are drawn with
replacement. If set to ``0`` no ensemble is fit.
Deprecated - will be removed in Auto-sklearn 0.16. Please pass
this argument via ``ensemble_kwargs={"ensemble_size": int}``
if you want to change the ensemble size for ensemble selection.
ensemble_class : Type[AbstractEnsemble] | "default", optional (default="default")
Class implementing the post-hoc ensemble algorithm. Set to
``None`` to disable ensemble building, or use :class:`SingleBest`
to use only the single best model instead of an ensemble.
If set to "default" it will use :class:`EnsembleSelection` for
single-objective problems and :class:`MultiObjectiveDummyEnsemble`
for multi-objective problems.
ensemble_kwargs : Dict, optional
Keyword arguments that are passed to the ensemble class upon
initialization.
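**Example** (a minimal sketch, assuming the default
:class:`EnsembleSelection` class; ``ensemble_size`` is one keyword
argument it accepts):
.. code-block:: python

    automl = AutoSklearnClassifier(
        ensemble_class=EnsembleSelection,
        ensemble_kwargs={"ensemble_size": 10},
    )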
ensemble_nbest : int, optional (default=50)
Only consider the ``ensemble_nbest`` models when building an
ensemble. This is inspired by a concept called library pruning
introduced in `Getting Most out of Ensemble Selection`. This
is independent of the ``ensemble_class`` argument and this
pruning step is done prior to constructing an ensemble.
max_models_on_disc: int, optional (default=50),
Defines the maximum number of models that are kept on disk.
Models beyond this limit are permanently deleted. As a consequence,
this also sets the upper limit on how many models can be used
for an ensemble.
It must be an integer greater than or equal to 1.
If set to None, all models are kept on disk.
seed : int, optional (default=1)
Used to seed SMAC. Will determine the output file names.
memory_limit : int, optional (3072)
Memory limit in MB for the machine learning algorithm.
`auto-sklearn` will stop fitting the machine learning algorithm if
it tries to allocate more than ``memory_limit`` MB.
**Important notes:**
* If ``None`` is provided, no memory limit is set.
* In case of multi-processing, ``memory_limit`` will be *per job*, so the total usage is
``n_jobs x memory_limit``.
* The memory limit also applies to the ensemble creation process.
include : Optional[Dict[str, List[str]]] = None
If None, all possible algorithms are used.
Otherwise, specifies a step and the components that are included in search.
See ``/pipeline/components/<step>/*`` for available components.
Incompatible with parameter ``exclude``.
**Possible Steps**:
* ``"data_preprocessor"``
* ``"balancing"``
* ``"feature_preprocessor"``
* ``"classifier"`` - Only for when when using ``AutoSklearnClasssifier``
* ``"regressor"`` - Only for when when using ``AutoSklearnRegressor``
**Example**:
.. code-block:: python
include = {
'classifier': ["random_forest"],
'feature_preprocessor': ["no_preprocessing"]
}
exclude : Optional[Dict[str, List[str]]] = None
If None, all possible algorithms are used.
Otherwise, specifies a step and the components that are excluded from search.
See ``/pipeline/components/<step>/*`` for available components.
Incompatible with parameter ``include``.
**Possible Steps**:
* ``"data_preprocessor"``
* ``"balancing"``
* ``"feature_preprocessor"``
* ``"classifier"`` - Only for when when using ``AutoSklearnClasssifier``
* ``"regressor"`` - Only for when when using ``AutoSklearnRegressor``
**Example**:
.. code-block:: python
exclude = {
'classifier': ["random_forest"],
'feature_preprocessor': ["no_preprocessing"]
}
resampling_strategy : str | BaseCrossValidator | _RepeatedSplits | BaseShuffleSplit = "holdout"
How to handle overfitting; you might need to use ``resampling_strategy_arguments``
if using a ``"cv"``-based method or a Splitter object.
* **Options**
* ``"holdout"`` - Use a 67:33 (train:test) split
* ``"cv"``: perform cross validation, requires "folds" in ``resampling_strategy_arguments``
* ``"holdout-iterative-fit"`` - Same as "holdout" but iterative fit where possible
* ``"cv-iterative-fit"``: Same as "cv" but iterative fit where possible
* ``"partial-cv"``: Same as "cv" but uses intensification.
* ``BaseCrossValidator`` - any BaseCrossValidator subclass (found in scikit-learn model_selection module)
* ``_RepeatedSplits`` - any _RepeatedSplits subclass (found in scikit-learn model_selection module)
* ``BaseShuffleSplit`` - any BaseShuffleSplit subclass (found in scikit-learn model_selection module)
If using a Splitter object that relies on the dataset retaining its current
size and order, you will need to look at the ``dataset_compression`` argument
and ensure that ``"subsample"`` is not included in the applied compression
``"methods"`` or disable it entirely with ``False``.
resampling_strategy_arguments : Optional[Dict] = None
Additional arguments for ``resampling_strategy``; this is required
when using a ``cv``-based strategy. The default arguments if left as
``None`` are:
.. code-block:: python
{
"train_size": 0.67, # The size of the training set
"shuffle": True, # Whether to shuffle before splitting data
"folds": 5 # Used in 'cv' based resampling strategies
}
If using a custom splitter class that takes ``n_splits``, such as
`KFold <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn-model-selection-kfold>`_,
the value of ``"folds"`` will be used.
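**Example** (a hedged sketch of a ``"cv"``-based setup;
``AutoSklearnClassifier`` stands in for whichever estimator is used):
.. code-block:: python

    automl = AutoSklearnClassifier(
        resampling_strategy="cv",
        # "folds" is required for "cv"-based strategies
        resampling_strategy_arguments={"folds": 5},
    )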
tmp_folder : string, optional (None)
folder to store configuration output and log files, if ``None``
automatically use ``/tmp/autosklearn_tmp_$pid_$random_number``
delete_tmp_folder_after_terminate: bool, optional (True)
remove ``tmp_folder`` when finished. If ``tmp_folder`` is ``None``,
the temporary directory will always be deleted.
n_jobs : int, optional, experimental
The number of jobs to run in parallel for ``fit()``. ``-1`` means
using all processors.
**Important notes**:
* By default, Auto-sklearn uses one core.
* Ensemble building is not affected by ``n_jobs`` but can be controlled by the number
of models in the ensemble.
* ``predict()`` is not affected by ``n_jobs`` (in contrast to most scikit-learn models)
* If ``dask_client`` is ``None``, a new dask client is created.
dask_client : dask.distributed.Client, optional
User-created dask client, can be used to start a dask cluster and then
attach auto-sklearn to it.
disable_evaluator_output: bool or list, optional (False)
If True, disable model and prediction output. Cannot be used
together with ensemble building. ``predict()`` cannot be used when
this is set to ``True``. Can also be used as a list to pass more
fine-grained information on what to save. Allowed elements in the
list are:
* ``'y_optimization'`` : do not save the predictions for the
optimization set, which would later on be used to build an ensemble.
* ``'model'`` : do not save any model files.
smac_scenario_args : dict, optional (None)
Additional arguments inserted into the scenario of SMAC. See the
`SMAC documentation <https://automl.github.io/SMAC3/main/api/smac.scenario.html#smac.scenario.Scenario>`_
for a list of available arguments.
get_smac_object_callback : callable
Callback function to create an object of class
`smac.facade.AbstractFacade <https://automl.github.io/SMAC3/main/api/smac.facade.html>`_.
The function must accept the arguments ``scenario_dict``,
``instances``, ``num_params``, ``runhistory``, ``seed`` and ``ta``.
This is an advanced feature. Use only if you are familiar with
`SMAC <https://automl.github.io/SMAC3/main/index.html>`_.
logging_config : dict, optional (None)
dictionary object specifying the logger configuration. If None,
the default logging.yaml file is used, which can be found in
the directory ``util/logging.yaml`` relative to the installation.
metadata_directory : str, optional (None)
path to the metadata directory. If None, the default directory
(autosklearn.metalearning.files) is used.
metric : Scorer, optional (None)
An instance of :class:`autosklearn.metrics.Scorer` as created by
:meth:`autosklearn.metrics.make_scorer`. These are the `Built-in
Metrics`_.
If None is provided, a default metric is selected depending on the task.
scoring_functions : List[Scorer], optional (None)
List of scorers which will be calculated for each pipeline; results will be
available via ``cv_results_``.
load_models : bool, optional (True)
Whether to load the models after fitting Auto-sklearn.
get_trials_callback: callable
A callable with the following definition.
* (smac.SMBO, smac.RunInfo, smac.RunValue, time_left: float) -> bool | None
This will be called after SMAC, the underlying optimizer for autosklearn,
finishes training each run.
You can use this to record your own information about the optimization
process, or to enable early stopping based on some criterion.
See the example:
:ref:`Early Stopping And Callbacks <sphx_glr_examples_40_advanced_example_early_stopping_and_callbacks.py>`.
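**Example** (a minimal early-stopping sketch following the signature
above; the ``0.02`` threshold is an arbitrary illustration):
.. code-block:: python

    def stop_when_good_enough(smbo, run_info, run_value, time_left):
        # Returning False asks the optimizer to stop early.
        if run_value.cost <= 0.02:
            return False

    automl = AutoSklearnClassifier(
        get_trials_callback=stop_when_good_enough,
    )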
dataset_compression: Union[bool, Mapping[str, Any]] = True
We compress datasets so that they fit into some predefined amount of memory.
Currently this does not apply to dataframes or sparse arrays, only to raw
numpy arrays.
**NOTE** - If using a custom ``resampling_strategy`` that relies on specific
size or ordering of data, this must be disabled to preserve these properties.
You can disable this entirely by passing ``False``, or leave the
default ``True``, in which case the following configuration is used:
.. code-block:: python
{
"memory_allocation": 0.1,
"methods": ["precision", "subsample"]
}
You can also pass your own configuration with the same keys and choosing
from the available ``"methods"``.
The available options are described here:
* **memory_allocation**
By default, we attempt to fit the dataset into ``0.1 * memory_limit``.
This float value can be set with ``"memory_allocation": 0.1``.
We also allow for specifying absolute memory in MB, e.g. 10MB is
``"memory_allocation": 10``.
The memory used by the dataset is checked after each reduction method is
performed. If the dataset fits into the allocated memory, any further
methods listed in ``"methods"`` will not be performed.
For example, if ``methods: ["precision", "subsample"]`` and the
``"precision"`` reduction step was enough to make the dataset fit into
memory, then the ``"subsample"`` reduction step will not be performed.
* **methods**
We provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order as given.
* ``"precision"`` - We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``
* ``subsample`` - We subsample data such that it **fits directly into
the memory allocation** ``memory_allocation * memory_limit``.
Therefore, this should likely be the last method listed in
``"methods"``.
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each
label is included in the sampled set.
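**Example** (a hedged sketch: allocate an absolute 128 MB and only
reduce precision, never subsample):
.. code-block:: python

    automl = AutoSklearnClassifier(
        dataset_compression={
            "memory_allocation": 128,  # absolute budget in MB
            "methods": ["precision"],
        },
    )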
allow_string_features: bool = True
Whether autosklearn should process string features. By default,
text preprocessing is enabled.
disable_progress_bar: bool = False
Whether to disable the progress bar that is displayed in the console
while fitting to the training data.
Attributes
----------
cv_results_ : dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas ``DataFrame``.
Not all keys returned by scikit-learn are supported yet.
performance_over_time_ : pandas.core.frame.DataFrame
A ``DataFrame`` containing the models' performance over time. Can be
used for plotting directly. Please refer to the example
:ref:`Train and Test Inputs <sphx_glr_examples_40_advanced_example_pandas_train_test.py>`.
""" # noqa (links are too long)
# Raise error if the given total time budget is less than 30 seconds.
if time_left_for_this_task < 30:
raise ValueError("Time left for this task must be at least " "30 seconds. ")
self.time_left_for_this_task = time_left_for_this_task
self.per_run_time_limit = per_run_time_limit
self.initial_configurations_via_metalearning = (
initial_configurations_via_metalearning
)
# Need to resolve the ensemble class here so we can act on it below.
if ensemble_class == "default":
ensemble_class = self._resolve_ensemble_class(metric)
self.ensemble_class = ensemble_class
# User specified `ensemble_size` explicitly, warn them about deprecation
if ensemble_size is not None:
# Keep consistent behaviour
message = (
"`ensemble_size` has been deprecated, please use `ensemble_kwargs = "
"{'ensemble_size': %d}`. Inserting `ensemble_size` into "
"`ensemble_kwargs` for now. `ensemble_size` will be removed in "
"auto-sklearn 0.16."
) % ensemble_size
if ensemble_class == EnsembleSelection:
if ensemble_kwargs is None:
ensemble_kwargs = {"ensemble_size": ensemble_size}
warnings.warn(message, DeprecationWarning, stacklevel=2)
elif "ensemble_size" not in ensemble_kwargs:
ensemble_kwargs["ensemble_size"] = ensemble_size
warnings.warn(message, DeprecationWarning, stacklevel=2)
else:
warnings.warn(
"Deprecated argument `ensemble_size` is both provided "
"as an argument to the constructor and passed inside "
"`ensemble_kwargs`. Will ignore the argument and use "
"the value given in `ensemble_kwargs` (%d). `ensemble_size` "
"will be removed in auto-sklearn 0.16."
% ensemble_kwargs["ensemble_size"],
DeprecationWarning,
stacklevel=2,
)
else:
warnings.warn(
"`ensemble_size` has been deprecated, please use "
"`ensemble_kwargs = {'ensemble_size': %d} if this "
"was intended. Ignoring `ensemble_size` because "
"`ensemble_class` != EnsembleSelection. "
"`ensemble_size` will be removed in auto-sklearn 0.16."
% ensemble_size,
DeprecationWarning,
stacklevel=2,
)
self.ensemble_size = (
ensemble_size # Otherwise sklearn.base.get_params() will complain
)
self.ensemble_kwargs = ensemble_kwargs
self.ensemble_nbest = ensemble_nbest
self.max_models_on_disc = max_models_on_disc
self.seed = seed
self.memory_limit = memory_limit
self.include = include
self.exclude = exclude
self.resampling_strategy = resampling_strategy
self.resampling_strategy_arguments = resampling_strategy_arguments
self.tmp_folder = tmp_folder
self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
self.n_jobs = n_jobs
self.dask_client = dask_client
self.disable_evaluator_output = disable_evaluator_output
self.get_smac_object_callback = get_smac_object_callback
self.smac_scenario_args = smac_scenario_args
self.logging_config = logging_config
self.metadata_directory = metadata_directory
self.metric = metric
self.scoring_functions = scoring_functions
self.load_models = load_models
self.get_trials_callback = get_trials_callback
self.dataset_compression = dataset_compression
self.allow_string_features = allow_string_features
self.disable_progress_bar = disable_progress_bar
self.automl_ = None # type: Optional[AutoML]
# Handle the number of jobs and the time for them
self._n_jobs = None
if self.n_jobs is None or self.n_jobs == 1:
self._n_jobs = 1
elif self.n_jobs == -1:
self._n_jobs = joblib.cpu_count()
else:
self._n_jobs = self.n_jobs
super().__init__()
def __getstate__(self):
# Cannot serialize a client!
self.dask_client = None
return self.__dict__
def build_automl(self):
initial_configs = self.initial_configurations_via_metalearning
automl = self._get_automl_class()(
temporary_directory=self.tmp_folder,
delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
time_left_for_this_task=self.time_left_for_this_task,
per_run_time_limit=self.per_run_time_limit,
initial_configurations_via_metalearning=initial_configs,
ensemble_class=self.ensemble_class,
ensemble_kwargs=self.ensemble_kwargs,
ensemble_nbest=self.ensemble_nbest,
max_models_on_disc=self.max_models_on_disc,
seed=self.seed,
memory_limit=self.memory_limit,
include=self.include,
exclude=self.exclude,
resampling_strategy=self.resampling_strategy,
resampling_strategy_arguments=self.resampling_strategy_arguments,
n_jobs=self._n_jobs,
dask_client=self.dask_client,
get_smac_object_callback=self.get_smac_object_callback,
disable_evaluator_output=self.disable_evaluator_output,
smac_scenario_args=self.smac_scenario_args,
logging_config=self.logging_config,
metadata_directory=self.metadata_directory,
metrics=[self.metric] if isinstance(self.metric, Scorer) else self.metric,
scoring_functions=self.scoring_functions,
get_trials_callback=self.get_trials_callback,
dataset_compression=self.dataset_compression,
allow_string_features=self.allow_string_features,
disable_progress_bar=self.disable_progress_bar,
)
return automl
def fit(self, **kwargs):
# Automatically set the cutoff time per task
if self.per_run_time_limit is None:
self.per_run_time_limit = self._n_jobs * self.time_left_for_this_task // 10
if self.automl_ is None:
self.automl_ = self.build_automl()
self.automl_.fit(load_models=self.load_models, **kwargs)
return self
def fit_pipeline(
self,
X: SUPPORTED_FEAT_TYPES,
y: Union[SUPPORTED_TARGET_TYPES, spmatrix],
config: Union[Configuration, Dict[str, Union[str, float, int]]],
dataset_name: Optional[str] = None,
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
y_test: Optional[Union[SUPPORTED_TARGET_TYPES, spmatrix]] = None,
feat_type: Optional[List[str]] = None,
*args,
**kwargs: Dict,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:
"""Fits and individual pipeline configuration and returns
the result to the user.
The Estimator constraints are honored, for example the resampling
strategy, or memory constraints, unless directly provided to the method.
By default, this method supports the same signature as fit(), and any extra
arguments are redirected to the TAE evaluation function, which allows for
further customization while building a pipeline.
Any additional argument provided is directly passed to the
worker exercising the run.
Parameters
----------
X: array-like, shape = (n_samples, n_features)
The features used for training
y: array-like
The labels used for training
X_test: array-like, optional, shape = (n_samples, n_features)
If provided, the testing performance will be tracked on these features.
y_test: array-like, optional
If provided, the testing performance will be tracked on these labels.
config: Union[Configuration, Dict[str, Union[str, float, int]]]
A configuration object used to define the pipeline steps.
If a dict is passed, a configuration is created based on this dict.
dataset_name: Optional[str]
Name used to tag and identify the Auto-Sklearn run.
feat_type : list, optional (default=None)
List of str of `len(X.shape[1])` describing the attribute type.
Possible types are `Categorical` and `Numerical`. `Categorical`
attributes will be automatically One-Hot encoded. The values
used for a categorical attribute must be integers, obtained for
example by `sklearn.preprocessing.LabelEncoder
<https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html>`_.
Returns
-------
pipeline: Optional[BasePipeline]
The fitted pipeline. In case of failure while fitting the pipeline,
``None`` is returned.
run_info: RunInfo
A named tuple that contains the configuration that was launched.
run_value: RunValue
A named tuple that contains the result of the run
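**Example** (a minimal sketch; samples a random configuration from the
search space rather than hand-writing one, and ``X_train``/``y_train``
are placeholder data):
.. code-block:: python

    cs = automl.get_configuration_space(X_train, y_train)
    config = cs.sample_configuration()  # any valid configuration works
    pipeline, run_info, run_value = automl.fit_pipeline(
        X=X_train,
        y=y_train,
        config=config,
    )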
"""
if self.automl_ is None:
self.automl_ = self.build_automl()
return self.automl_.fit_pipeline(
X=X,
y=y,
dataset_name=dataset_name,
config=config,
feat_type=feat_type,
X_test=X_test,
y_test=y_test,
*args,
**kwargs,
)
def fit_ensemble(
self,
y,
task: Optional[int] = None,
precision: Literal[16, 32, 64] = 32,
dataset_name: Optional[str] = None,
ensemble_size: int | None = None,
ensemble_kwargs: Optional[Dict[str, Any]] = None,
ensemble_nbest: Optional[int] = None,
ensemble_class: Type[AbstractEnsemble] | Literal["default"] | None = "default",
metric: Scorer | Sequence[Scorer] | None = None,
):
"""Fit an ensemble to models trained during an optimization process.
All parameters are ``None`` by default. If no other value is given,
the default values which were set in a call to ``fit()`` are used.
Parameters
----------
y : array-like
Target values.
task : int
A constant from the module ``autosklearn.constants``. Determines
the task type (binary classification, multiclass classification,
multilabel classification or regression).
precision : int
Numeric precision used when loading ensemble data. Can be either
``16``, ``32`` or ``64``.
dataset_name : str
Name of the current data set.
ensemble_size : int, optional
Number of models added to the ensemble built by *Ensemble
selection from libraries of models*. Models are drawn with
replacement. If set to ``0`` no ensemble is fit.
Deprecated - will be removed in Auto-sklearn 0.16. Please pass
this argument via ``ensemble_kwargs={"ensemble_size": int}``
if you want to change the ensemble size for ensemble selection.
ensemble_kwargs : Dict, optional
Keyword arguments that are passed to the ensemble class upon
initialization.
ensemble_nbest : int
Only consider the ``ensemble_nbest`` models when building an
ensemble. This is inspired by a concept called library pruning
introduced in `Getting Most out of Ensemble Selection`. This
is independent of the ``ensemble_class`` argument and this
pruning step is done prior to constructing an ensemble.
ensemble_class : Type[AbstractEnsemble] | "default", optional (default="default")
Class implementing the post-hoc ensemble algorithm. Set to
``None`` to disable ensemble building, or use :class:`SingleBest`
to use only the single best model instead of an ensemble.
If set to "default" it will use :class:`EnsembleSelection` for
single-objective problems and :class:`MultiObjectiveDummyEnsemble`
for multi-objective problems.
metric: Scorer | Sequence[Scorer] | None = None
A metric or sequence of metrics to score the ensemble with.
Returns
-------
self
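**Example** (a hedged sketch: search without ensembling first, then
build the ensemble afterwards; ``X_train``/``y_train`` are placeholder
data):
.. code-block:: python

    automl = AutoSklearnClassifier(ensemble_class=None)
    automl.fit(X_train, y_train)
    # Build an ensemble-selection ensemble over the models found above
    automl.fit_ensemble(y_train, ensemble_class=EnsembleSelection)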
""" # noqa: E501
if ensemble_class == "default":
# Things are actually a little more nuanced here:
# * If they passed `metric=None` at init, we would infer this in automl
# during `fit` and store it in the automl instance.
# * If they passed a `metric` in init and left it `None` here, this would
# also be in the automl instance
# => We can use self.automl_ as ground truth for metric if no metrics passed
# and we have one created
if metric is None and self.automl_ is not None and self.automl_._metrics:
metric = self.automl_._metrics
ensemble_class = self._resolve_ensemble_class(metric)
self.ensemble_class = ensemble_class
# User specified `ensemble_size` explicitly, warn them about deprecation
if ensemble_size is not None:
# Keep consistent behaviour
message = (
"`ensemble_size` has been deprecated, please use `ensemble_kwargs = "
"{'ensemble_size': %d}`. Inserting `ensemble_size` into "
"`ensemble_kwargs` for now. `ensemble_size` will be removed in "
"auto-sklearn 0.16."
) % ensemble_size
if ensemble_class == EnsembleSelection:
if ensemble_kwargs is None:
ensemble_kwargs = {"ensemble_size": ensemble_size}
warnings.warn(message, DeprecationWarning, stacklevel=2)
elif "ensemble_size" not in ensemble_kwargs:
ensemble_kwargs["ensemble_size"] = ensemble_size
warnings.warn(message, DeprecationWarning, stacklevel=2)
else:
warnings.warn(
"Deprecated argument `ensemble_size` is both provided "
"as an argument to the constructor and passed inside "
"`ensemble_kwargs`. Will ignore the argument and use "
"the value given in `ensemble_kwargs` (%d). `ensemble_size` "
"will be removed in auto-sklearn 0.16."
% ensemble_kwargs["ensemble_size"],
DeprecationWarning,
stacklevel=2,
)
else:
warnings.warn(
"`ensemble_size` has been deprecated, please use "
"`ensemble_kwargs = {'ensemble_size': %d} if this "
"was intended. Ignoring `ensemble_size` because "
"`ensemble_class` != EnsembleSelection. "
"`ensemble_size` will be removed in auto-sklearn 0.16."
% ensemble_size,
DeprecationWarning,
stacklevel=2,
)
if self.automl_ is None:
# Build a dummy automl object to call fit_ensemble
# The ensemble size is honored in the .automl_.fit_ensemble
# call
self.automl_ = self.build_automl()
self.automl_.fit_ensemble(
y=y,
task=task,
precision=precision,
dataset_name=dataset_name,
ensemble_nbest=ensemble_nbest,
ensemble_class=ensemble_class,
ensemble_kwargs=ensemble_kwargs,
metrics=metric,
)
return self
def _resolve_ensemble_class(
self,
metric: Scorer | Sequence[Scorer] | None,
) -> type[AbstractEnsemble]:
return (
EnsembleSelection
if metric is None or isinstance(metric, Scorer) or len(metric) == 1
else MultiObjectiveDummyEnsemble
)
def refit(self, X, y):
"""Refit all models found with fit to new data.
Necessary when using cross-validation. During training, auto-sklearn
fits each model k times on the dataset, but does not keep any trained
model and can therefore not be used to predict for new data points.
This method fits all models found during a call to fit on the data
given. This method may also be used together with holdout to avoid
only using 67% of the training data to fit the final model.
Parameters
----------
X : array-like or sparse matrix of shape = [n_samples, n_features]
The training input samples.
y : array-like, shape = [n_samples] or [n_samples, n_outputs]
The targets.
Returns
-------
self
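**Example** (a hedged sketch of the cross-validation workflow this
method is intended for; ``X_train`` etc. are placeholder data):
.. code-block:: python

    automl = AutoSklearnClassifier(resampling_strategy="cv")
    automl.fit(X_train, y_train)
    # Retrain every model in the ensemble on the full training data
    automl.refit(X_train, y_train)
    predictions = automl.predict(X_test)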
"""
self.automl_.refit(X, y)
return self
def predict(self, X, batch_size=None, n_jobs=1):
"""Predict targets for X. Delegates to the underlying AutoML object."""
return self.automl_.predict(X, batch_size=batch_size, n_jobs=n_jobs)
def predict_proba(self, X, batch_size=None, n_jobs=1):
"""Predict class probabilities for X. Delegates to the underlying AutoML object."""
return self.automl_.predict_proba(X, batch_size=batch_size, n_jobs=n_jobs)
def score(self, X, y):
"""Score the given test data and labels. Delegates to the underlying AutoML object."""
return self.automl_.score(X, y)
def show_models(self):
"""Returns a dictionary containing dictionaries of ensemble models.
Each model in the ensemble can be accessed by giving its ``model_id`` as key.
A model dictionary contains the following:
* ``"model_id"`` - The id given to a model by ``autosklearn``.
* ``"rank"`` - The rank of the model based on it's ``"cost"``.
* ``"cost"`` - The loss of the model on the validation set.
* ``"ensemble_weight"`` - The weight given to the model in the ensemble.
* ``"voting_model"`` - The ``cv_voting_ensemble`` model (for 'cv' resampling).
* ``"estimators"`` - List of models (dicts) in ``cv_voting_ensemble``
('cv' resampling).
* ``"data_preprocessor"`` - The preprocessor used on the data.
* ``"balancing"`` - The balancing used on the data (for classification).
* ``"feature_preprocessor"`` - The preprocessor for features types.
* ``"classifier"`` / ``"regressor"``
- The autosklearn wrapped classifier or regressor.
* ``"sklearn_classifier"`` or ``"sklearn_regressor"``
- The sklearn classifier or regressor.
**Example**
.. code-block:: python
import sklearn.datasets
import sklearn.metrics
import autosklearn.regression
X, y = sklearn.datasets.load_diabetes(return_X_y=True)
automl = autosklearn.regression.AutoSklearnRegressor(
time_left_for_this_task=120
)
automl.fit(X, y, dataset_name='diabetes')
ensemble_dict = automl.show_models()
print(ensemble_dict)
Output:
.. code-block:: text
{
25: {'model_id': 25.0,
'rank': 1,
'cost': 0.43667876507897496,
'ensemble_weight': 0.38,
'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing....>,
'feature_preprocessor': <autosklearn.pipeline.components....>,
'regressor': <autosklearn.pipeline.components.regression....>,
'sklearn_regressor': SGDRegressor(alpha=0.0006517033225329654,...)
},
6: {'model_id': 6.0,
'rank': 2,
'cost': 0.4550418898836528,
'ensemble_weight': 0.3,
'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing....>,
'feature_preprocessor': <autosklearn.pipeline.components....>,
'regressor': <autosklearn.pipeline.components.regression....>,
'sklearn_regressor': ARDRegression(alpha_1=0.0003701926442639788,...)
}...
}
Returns
-------
Dict[int, Any] : dictionary of length = number of models in the ensemble
A dictionary of models in the ensemble, where ``model_id`` is the key.
""" # noqa: E501
return self.automl_.show_models()
def get_models_with_weights(self):
"""Return a list of the final ensemble found by auto-sklearn.
Returns
-------
[(weight_1, model_1), ..., (weight_n, model_n)]
"""
return self.automl_.get_models_with_weights()
@property
def performance_over_time_(self):
return self.automl_.performance_over_time_
@property
def cv_results_(self):
return self.automl_.cv_results_
@property
def trajectory_(self):
return self.automl_.trajectory_
@property
def fANOVA_input_(self):
return self.automl_.fANOVA_input_
def sprint_statistics(self):
"""Return the following statistics of the training result:
- dataset name
- metric used
- best validation score
- number of target algorithm runs
- number of successful target algorithm runs
- number of crashed target algorithm runs
- number of target algorithm runs that exceeded the memory limit
- number of target algorithm runs that exceeded the time limit
Returns
-------
str
"""
return self.automl_.sprint_statistics()
def leaderboard(
self,
detailed: bool = False,
ensemble_only: bool = True,
top_k: Union[int, Literal["all"]] = "all",
sort_by: str = "cost",
sort_order: Literal["auto", "ascending", "descending"] = "auto",
include: Optional[Union[str, Iterable[str]]] = None,
) -> pd.DataFrame:
"""Returns a pandas table of results for all evaluated models.
Gives an overview of all models trained during the search process along
with various statistics about their training.
The available statistics are:
**Simple**:
* ``"model_id"`` - The id given to a model by ``autosklearn``.
* ``"rank"`` - The rank of the model based on it's ``"cost"``.
* ``"ensemble_weight"`` - The weight given to the model in the ensemble.
* ``"type"`` - The type of classifier/regressor used.
* ``"cost"`` - The loss of the model on the validation set.
* ``"duration"`` - Length of time the model was optimized for.
**Detailed**:
The detailed view includes all of the simple statistics along with the
following.
* ``"config_id"`` - The id used by SMAC for optimization.
* ``"budget"`` - How much budget was allocated to this model.
* ``"status"`` - The return status of training the model with SMAC.
* ``"train_loss"`` - The loss of the model on the training set.
* ``"balancing_strategy"`` - The balancing strategy used for data preprocessing.
* ``"start_time"`` - Time the model began being optimized
* ``"end_time"`` - Time the model ended being optimized
* ``"data_preprocessors"`` - The preprocessors used on the data
* ``"feature_preprocessors"`` - The preprocessors for features types
Parameters
----------
detailed: bool = False
Whether to give detailed information or just a simple overview.
ensemble_only: bool = True
Whether to view only models included in the ensemble or all models
trained.
top_k: int or "all" = "all"
How many models to display.
sort_by: str = 'cost'
What column to sort by. If that column is not present, the sorting
defaults to the ``"model_id"`` index column. Defaults to the metric
optimized; in the case of a multi-objective optimization problem,
sorts by the first objective.
sort_order: "auto" or "ascending" or "descending" = "auto"
Which sort order to apply to the ``sort_by`` column. If left