diff --git a/docs/guides/scheduling.md b/docs/guides/scheduling.md index 24db2194..52890940 100644 --- a/docs/guides/scheduling.md +++ b/docs/guides/scheduling.md @@ -424,6 +424,8 @@ However, there are more explicit methods. ```python exec="true" source="material-block" html="True" hl_lines="20" import time + from asyncio import Future + from amltk.scheduling import Scheduler scheduler = Scheduler.with_processes(1) @@ -437,7 +439,7 @@ However, there are more explicit methods. def submit_calculations() -> None: scheduler.submit(expensive_function) - # The will endlessly loop the scheduler + # This will endlessly loop the scheduler @scheduler.on_future_done def submit_again(future: Future) -> None: if scheduler.running(): @@ -468,6 +470,7 @@ the default, but it also takes three other possibilities: One example is to just `stop()` the scheduler when some exception occurs. ```python exec="true" source="material-block" html="True" hl_lines="12-15" +from asyncio import Future from amltk.scheduling import Scheduler scheduler = Scheduler.with_processes(1) diff --git a/docs/hooks/cleanup_log_output.py b/docs/hooks/cleanup_log_output.py index 49ee48c2..98c65421 100644 --- a/docs/hooks/cleanup_log_output.py +++ b/docs/hooks/cleanup_log_output.py @@ -13,7 +13,7 @@ import mkdocs.plugins import mkdocs.structure.pages -from amltk.exceptions import AutomaticParameterWarning, TaskTypeWarning +from amltk.exceptions import AutomaticParameterWarning log = logging.getLogger("mkdocs") diff --git a/docs/hooks/debug_which_page_is_being_rendered.py b/docs/hooks/debug_which_page_is_being_rendered.py index 69bf86f7..6ad00827 100644 --- a/docs/hooks/debug_which_page_is_being_rendered.py +++ b/docs/hooks/debug_which_page_is_being_rendered.py @@ -8,21 +8,12 @@ import mkdocs import mkdocs.plugins import mkdocs.structure.pages -import os log = logging.getLogger("mkdocs") -RENDER_EXAMPLES_ENV_VAR = "AMLTK_DOC_RENDER_EXAMPLES" -EXEC_DOCS_ENV_VAR = "AMLTK_EXEC_DOCS" - -truthy_values = {"yes", "on", "true", "1", "all"} - def on_pre_page( page: mkdocs.structure.pages.Page, config: Any, files: Any, ) -> mkdocs.structure.pages.Page | None: - render_examples = os.environ.get(RENDER_EXAMPLES_ENV_VAR, "true") - render_code = os.environ.get(EXEC_DOCS_ENV_VAR, "true") - if render_examples.lower() in truthy_values or render_code.lower() in truthy_values: - log.info(f"{page.file.src_path}") + log.info(f"{page.file.src_path}") diff --git a/docs/hooks/disable_markdown_exec.py b/docs/hooks/disable_markdown_exec.py index 9962b9cd..8df35d76 100644 --- a/docs/hooks/disable_markdown_exec.py +++ b/docs/hooks/disable_markdown_exec.py @@ -22,7 +22,7 @@ logger = logging.getLogger("mkdocs") -def _print_msg(compiled_code: Any, exec_globals: dict) -> None: +def _print_msg(compiled_code: Any, code_block_id: int, exec_globals: dict) -> None: _print = exec_globals["print"] _print( f"Env variable {RUN_CODE_BLOCKS_ENV_VAR}=0 - No code to display." @@ -42,5 +42,5 @@ def on_startup(**kwargs: Any): ) from markdown_exec.formatters import python - setattr(python, "exec", _print_msg) + setattr(python, "exec_python", _print_msg) diff --git a/docs/reference/metalearning/index.md b/docs/reference/metalearning/index.md index 5592cb96..066918f6 100644 --- a/docs/reference/metalearning/index.md +++ b/docs/reference/metalearning/index.md @@ -11,18 +11,344 @@ to help implement these methods. 
## MetaFeatures
-::: amltk.metalearning.metafeatures
-    options:
-      members: false
+A [`MetaFeature`][amltk.metalearning.MetaFeature] is some
+statistic about a dataset/task, that can be used to make datasets or
+tasks more comparable, thus enabling meta-learning methods.
+
+Calculating meta-features of a dataset is quite straightforward.
+
+```python exec="true" source="material-block" result="python" title="Metafeatures" hl_lines="10"
+import openml
+from amltk.metalearning import compute_metafeatures
+
+dataset = openml.datasets.get_dataset(
+    31,  # credit-g
+    download_data=True,
+    download_features_meta_data=False,
+    download_qualities=False,
+)
+X, y, _, _ = dataset.get_data(
+    dataset_format="dataframe",
+    target=dataset.default_target_attribute,
+)
+
+mfs = compute_metafeatures(X, y)
+
+print(mfs)
+```
+
+By default [`compute_metafeatures()`][amltk.metalearning.compute_metafeatures] will
+calculate all of the [`MetaFeature`][amltk.metalearning.MetaFeature]s implemented,
+iterating through their subclasses to do so. You can also pass an explicit list
+with `compute_metafeatures(X, y, features=[...])`.
+
+Implementing your own is also quite straightforward:
+
+```python exec="true" source="material-block" result="python" title="Create Metafeature" hl_lines="10 11 12 13 14 15 16 17 18 19"
+from amltk.metalearning import MetaFeature, compute_metafeatures
+import openml
+import pandas as pd
+
+dataset = openml.datasets.get_dataset(
+    31,  # credit-g
+    download_data=True,
+    download_features_meta_data=False,
+    download_qualities=False,
+)
+X, y, _, _ = dataset.get_data(
+    dataset_format="dataframe",
+    target=dataset.default_target_attribute,
+)
+
+class TotalValues(MetaFeature):
+
+    @classmethod
+    def compute(
+        cls,
+        x: pd.DataFrame,
+        y: pd.Series | pd.DataFrame,
+        dependancy_values: dict,
+    ) -> int:
+        return int(x.shape[0] * x.shape[1])
+
+mfs = compute_metafeatures(X, y, features=[TotalValues])
+print(mfs)
+```
+
+As many metafeatures rely on pre-computed dataset statistics, and they do not
+need to be calculated more than once, you can specify the dependencies of
+a metafeature. When a metafeature would return something other than a single
+value, i.e. a `dict` or a `pd.DataFrame`, we instead call it a
+[`DatasetStatistic`][amltk.metalearning.DatasetStatistic]. These will
+**not** be included in the result of [`compute_metafeatures()`][amltk.metalearning.compute_metafeatures].
+These `DatasetStatistic`s will only be calculated once on a call to `compute_metafeatures()` so
+they can be re-used across all `MetaFeature`s that require that dependency.
+ +```python exec="true" source="material-block" result="python" title="Metafeature Dependancy" hl_lines="10 11 12 13 14 15 16 17 18 19 20 23 26 35" +from amltk.metalearning import MetaFeature, DatasetStatistic, compute_metafeatures +import openml +import pandas as pd + +dataset = openml.datasets.get_dataset( + 31, # credit-g + download_data=True, + download_features_meta_data=False, + download_qualities=False, +) +X, y, _, _ = dataset.get_data( + dataset_format="dataframe", + target=dataset.default_target_attribute, +) + +class NAValues(DatasetStatistic): + """A mask of all NA values in a dataset""" + + @classmethod + def compute( + cls, + x: pd.DataFrame, + y: pd.Series | pd.DataFrame, + dependancy_values: dict, + ) -> pd.DataFrame: + return x.isna() + + +class PercentageNA(MetaFeature): + """The percentage of values missing""" + + dependencies = (NAValues,) + + @classmethod + def compute( + cls, + x: pd.DataFrame, + y: pd.Series | pd.DataFrame, + dependancy_values: dict, + ) -> int: + na_values = dependancy_values[NAValues] + n_na = na_values.sum().sum() + n_values = int(x.shape[0] * x.shape[1]) + return float(n_na / n_values) + +mfs = compute_metafeatures(X, y, features=[PercentageNA]) +print(mfs) +``` + +To view the description of a particular `MetaFeature`, you can call +[`.description()`][amltk.metalearning.DatasetStatistic.description] +on it. Otherwise you can access all of them in the following way: + +```python exec="true" source="tabbed-left" result="python" title="Metafeature Descriptions" hl_lines="4" +from pprint import pprint +from amltk.metalearning import metafeature_descriptions + +descriptions = metafeature_descriptions() +for name, description in descriptions.items(): + print("---") + print(name) + print("---") + print(" * " + description) +``` ## Dataset Distances +One common way to define how similar two datasets are is to compute some "similarity" +between them. This notion of "similarity" requires computing some features of a dataset +(**metafeatures**) first, such that we can numerically compute some distance function. + +Let's see how we can quickly compute the distance between some datasets with +[`dataset_distance()`][amltk.metalearning.dataset_distance]! + +```python exec="true" source="material-block" result="python" title="Dataset Distances P.1" session='dd' +import pandas as pd +import openml + +from amltk.metalearning import compute_metafeatures + +def get_dataset(dataset_id: int) -> tuple[pd.DataFrame, pd.Series]: + dataset = openml.datasets.get_dataset( + dataset_id, + download_data=True, + download_features_meta_data=False, + download_qualities=False, + ) + X, y, _, _ = dataset.get_data( + dataset_format="dataframe", + target=dataset.default_target_attribute, + ) + return X, y + +d31 = get_dataset(31) +d3 = get_dataset(3) +d4 = get_dataset(4) + +metafeatures_dict = { + "dataset_31": compute_metafeatures(*d31), + "dataset_3": compute_metafeatures(*d3), + "dataset_4": compute_metafeatures(*d4), +} -::: amltk.metalearning.dataset_distances - options: - members: false +metafeatures = pd.DataFrame(metafeatures_dict) +print(metafeatures) +``` + +Now we want to know which one of `#!python "dataset_3"` or `#!python "dataset_4"` is +more _similar_ to `#!python "dataset_31"`. 
+
+```python exec="true" source="material-block" result="python" title="Dataset Distances P.2" session='dd'
+from amltk.metalearning import dataset_distance
+
+target = metafeatures_dict.pop("dataset_31")
+others = metafeatures_dict
+
+distances = dataset_distance(target, others, distance_metric="l2")
+print(distances)
+```
+
+Seems like `#!python "dataset_3"` is, by some notion, closer to `#!python "dataset_31"`
+than `#!python "dataset_4"`. However, the scales of the metafeatures are not all comparable.
+For example, many lie between `#!python (0, 1)`, but some, like `instance_count`, can completely
+dominate the distance.
+
+Let's repeat the computation but specify that we should apply a `#!python "minmax"` scaling
+across the rows.
+
+```python exec="true" source="material-block" result="python" title="Dataset Distances P.3" session='dd' hl_lines="5"
+distances = dataset_distance(
+    target,
+    others,
+    distance_metric="l2",
+    scaler="minmax"
+)
+print(distances)
+```
+
+Now `#!python "dataset_3"` is still considered more similar, but the difference between the two is a lot less
+dramatic. In general, applying some scaling to values of different scales is required for metalearning.
+
+You can also use an [sklearn.preprocessing.MinMaxScaler][] or any other scaler from scikit-learn
+for that matter.
+
+```python exec="true" source="material-block" result="python" title="Dataset Distances P.3" session='dd' hl_lines="7"
+from sklearn.preprocessing import MinMaxScaler
+
+distances = dataset_distance(
+    target,
+    others,
+    distance_metric="l2",
+    scaler=MinMaxScaler()
+)
+print(distances)
+```
 ## Portfolio Selection
+A portfolio in meta-learning is a set (ordered or not) of configurations
+that maximize some notion of coverage across datasets or tasks.
+The intuition here is that this also means that any new dataset is also covered!
+
+Suppose we have the given performances of some configurations across some datasets.
+```python exec="true" source="material-block" result="python" title="Initial Portfolio"
+import pandas as pd
+
+performances = {
+    "c1": [90, 60, 20, 10],
+    "c2": [20, 10, 90, 20],
+    "c3": [10, 20, 40, 90],
+    "c4": [90, 10, 10, 10],
+}
+portfolio = pd.DataFrame(performances, index=["dataset_1", "dataset_2", "dataset_3", "dataset_4"])
+print(portfolio)
+```
+
+If we could only choose `#!python k=3` of these configurations on some new given dataset, which ones would
+you choose and in what priority?
+Here is where we can apply [`portfolio_selection()`][amltk.metalearning.portfolio_selection]!
+
+The idea is that we pick a subset of these algorithms that maximise some value of utility for
+the portfolio. We do this by adding a single configuration from the entire set, 1-by-1 until
+we reach `k`, beginning with the empty portfolio.
+
+Let's see this in action!
+
+```python exec="true" source="material-block" result="python" title="Portfolio Selection" hl_lines="12 13 14 15 16"
+import pandas as pd
+from amltk.metalearning import portfolio_selection
+
+performances = {
+    "c1": [90, 60, 20, 10],
+    "c2": [20, 10, 90, 20],
+    "c3": [10, 20, 40, 90],
+    "c4": [90, 10, 10, 10],
+}
+portfolio = pd.DataFrame(performances, index=["dataset_1", "dataset_2", "dataset_3", "dataset_4"])
+
+selected_portfolio, trajectory = portfolio_selection(
+    portfolio,
+    k=3,
+    scaler="minmax"
+)
+
+print(selected_portfolio)
+print()
+print(trajectory)
+```
+
+The trajectory tells us which configuration was added at each iteration along with the utility
+of the portfolio with that configuration added.
However we havn't specified how _exactly_ we defined the +utility of a given portfolio. We could define our own function to do so: + +```python exec="true" source="material-block" result="python" title="Portfolio Selection Custom" hl_lines="12 13 14 20" +import pandas as pd +from amltk.metalearning import portfolio_selection + +performances = { + "c1": [90, 60, 20, 10], + "c2": [20, 10, 90, 20], + "c3": [10, 20, 40, 90], + "c4": [90, 10, 10, 10], +} +portfolio = pd.DataFrame(performances, index=["dataset_1", "dataset_2", "dataset_3", "dataset_4"]) + +def my_function(p: pd.DataFrame) -> float: + # Take the maximum score for each dataset and then take the mean across them. + return p.max(axis=1).mean() + +selected_portfolio, trajectory = portfolio_selection( + portfolio, + k=3, + scaler="minmax", + portfolio_value=my_function, +) + +print(selected_portfolio) +print() +print(trajectory) +``` + +This notion of reducing across all configurations for a dataset and then aggregating these is common +enough that we can also directly just define these operations and we will perform the rest. + +```python exec="true" source="material-block" result="python" title="Portfolio Selection With Reduction" hl_lines="17 18" +import pandas as pd +import numpy as np +from amltk.metalearning import portfolio_selection + +performances = { + "c1": [90, 60, 20, 10], + "c2": [20, 10, 90, 20], + "c3": [10, 20, 40, 90], + "c4": [90, 10, 10, 10], +} +portfolio = pd.DataFrame(performances, index=["dataset_1", "dataset_2", "dataset_3", "dataset_4"]) + +selected_portfolio, trajectory = portfolio_selection( + portfolio, + k=3, + scaler="minmax", + row_reducer=np.max, # This is actually the default + aggregator=np.mean, # This is actually the default +) -::: amltk.metalearning.portfolio - options: - members: false +print(selected_portfolio) +print() +print(trajectory) +``` diff --git a/docs/reference/optimization/metrics.md b/docs/reference/optimization/metrics.md index b6c9b379..0b5211f3 100644 --- a/docs/reference/optimization/metrics.md +++ b/docs/reference/optimization/metrics.md @@ -1,5 +1,31 @@ ## Metric +A [`Metric`][amltk.optimization.Metric] to let optimizers know how to +handle numeric values properly. -::: amltk.optimization.metric - options: - members: False +A `Metric` is defined by a `.name: str` and whether it is better to `.minimize: bool` +the metric. Further, you can specify `.bounds: tuple[lower, upper]` which can +help optimizers and other code know how to treat metrics. + +To easily convert between `loss` and +`score` of some value you can use the [`loss()`][amltk.optimization.Metric.loss] +and [`score()`][amltk.optimization.Metric.score] methods. + +If the metric is bounded, you can also make use of the +[`distance_to_optimal()`][amltk.optimization.Metric.distance_to_optimal] +function which is the distance to the optimal value. + +In the case of optimization, we provide a +[`normalized_loss()`][amltk.optimization.Metric.normalized_loss] which +normalized the value to be a minimization loss, that is also bounded +if the metric itself is bounded. + +```python exec="true" source="material-block" result="python" +from amltk.optimization import Metric + +acc = Metric("accuracy", minimize=False, bounds=(0, 100)) + +print(f"Distance: {acc.distance_to_optimal(90)}") # Distance to optimal. 
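+# Assumption: `Metric.worst` (an attribute referenced in the Trial docs) gives the
+# worst attainable value under the bounds; for accuracy bounded in (0, 100) that is 0.
+print(f"Worst: {acc.worst}")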
+print(f"Loss: {acc.loss(90)}") # Something that can be minimized +print(f"Score: {acc.score(90)}") # Something that can be maximized +print(f"Normalized loss: {acc.normalized_loss(90)}") # Normalized loss +``` diff --git a/docs/reference/optimization/optimizers.md b/docs/reference/optimization/optimizers.md index 777b5d38..7573447b 100644 --- a/docs/reference/optimization/optimizers.md +++ b/docs/reference/optimization/optimizers.md @@ -40,29 +40,28 @@ the [`Report`][amltk.optimization.Trial.Report], as this will be different for e to worry that the internal state of the optimizer is updated accordingly to these two _"Ask"_ and _"Tell"_ events and that's it. -For a reference on implementing an optimizer you can refer to any of the following: - - -## SMAC - -::: amltk.optimization.optimizers.smac - options: - members: false - -## NePs - -::: amltk.optimization.optimizers.neps - options: - members: false - -## Optuna - -::: amltk.optimization.optimizers.optuna - options: - members: false +For a reference on implementing an optimizer you can refer to any of the following +API Docs: +* [SMAC][amltk.optimization.optimizers.smac] +* [NePs][amltk.optimization.optimizers.neps] +* [Optuna][amltk.optimization.optimizers.optuna] +* [Random Search][amltk.optimization.optimizers.random_search] ## Integrating your own - -::: amltk.optimization.optimizer - options: - members: false +The base [`Optimizer`][amltk.optimization.optimizer.Optimizer] class, +defines the API we require optimizers to implement. + +* [`ask()`][amltk.optimization.optimizer.Optimizer.ask] - Ask the optimizer for a + new [`Trial`][amltk.optimization.trial.Trial] to evaluate. +* [`tell()`][amltk.optimization.optimizer.Optimizer.tell] - Tell the optimizer + the result of the sampled config. This comes in the form of a + [`Trial.Report`][amltk.optimization.trial.Trial.Report]. + +Additionally, to aid users from switching between optimizers, the +[`preferred_parser()`][amltk.optimization.optimizer.Optimizer.preferred_parser] +method should return either a `parser` function or a string that can be used +with [`node.search_space(parser=..._)`][amltk.pipeline.Node.search_space] to +extract the search space for the optimizer. + +Please refer to the code of [Random Search][amltk.optimization.optimizers.random_search] +on github for an example of how to implement a new optimizer. diff --git a/docs/reference/optimization/profiling.md b/docs/reference/optimization/profiling.md index 7a2b3147..5755a0cf 100644 --- a/docs/reference/optimization/profiling.md +++ b/docs/reference/optimization/profiling.md @@ -1,5 +1,77 @@ ## Profiling +Whether for debugging, building an AutoML system or for optimization +purposes, we provide a powerful [`Profiler`][amltk.profiling.Profiler], +which can generate a [`Profile`][amltk.profiling.Profile] of different sections +of code. This is particularly useful with [`Trial`][amltk.optimization.Trial]s, +so much so that we attach one to every `Trial` made as +[`trial.profiler`][amltk.optimization.Trial.profiler]. -:: amltk.profiling.profiler - options: - members: False +When done profiling, you can export all generated profiles as a dataframe using +[`profiler.df()`][amltk.profiling.Profiler.df]. 
+
+```python exec="true" result="python" source="material-block"
+from amltk.profiling import Profiler
+import numpy as np
+
+profiler = Profiler()
+
+with profiler("loading-data"):
+    X = np.random.rand(1000, 1000)
+
+with profiler("training-model"):
+    model = np.linalg.inv(X)
+
+with profiler("predicting"):
+    y = model @ X
+
+print(profiler.df())
+```
+
+You'll find these profiles as keys in the [`Profiler`][amltk.profiling.Profiler],
+e.g. `#!python profiler["loading-data"]`.
+
+This will measure both the time taken within the block and
+the memory consumed before and after the block finishes, allowing
+you to get an estimate of the memory consumed.
+
+
+??? tip "Memory, vms vs rss"
+
+    While not entirely accurate, this should be enough info
+    for most use cases.
+
+    Say the main process uses 2GB of memory and then spawns a new process
+    in which you are profiling, as you might do from a
+    [`Task`][amltk.scheduling.Task]. If in this new process you use another
+    2GB on top of that, then:
+
+    * The virtual memory size (**vms**) will show 4GB as the
+      new process will share the 2GB with the main process and
+      have its own 2GB.
+
+    * The resident set size (**rss**) will show 2GB as the
+      new process will only have 2GB of its own memory.
+
+
+If you need to profile some iterator, like a for loop, you can use
+[`Profiler.each()`][amltk.profiling.Profiler.each] which will measure
+the entire loop but also each individual iteration. This can be useful
+for iterating batches of a deep-learning model, splits of a cross-validator
+or really any loop with work you want to profile.
+
+```python exec="true" result="python" source="material-block"
+from amltk.profiling import Profiler
+import numpy as np
+
+profiler = Profiler()
+
+for i in profiler.each(range(3), name="for-loop"):
+    X = np.random.rand(1000, 1000)
+
+print(profiler.df())
+```
+
+Lastly, to disable profiling without editing much code,
+you can always use [`Profiler.disable()`][amltk.profiling.Profiler.disable]
+and [`Profiler.enable()`][amltk.profiling.Profiler.enable] to toggle
+profiling on and off.
diff --git a/docs/reference/optimization/trials.md b/docs/reference/optimization/trials.md
index b71a41a0..cccdfbb9 100644
--- a/docs/reference/optimization/trials.md
+++ b/docs/reference/optimization/trials.md
@@ -1,11 +1,197 @@
-## Trial
+## Trial and Report
-::: amltk.optimization.trial
-    options:
-      members: False
+[`Trial`][amltk.optimization.trial.Trial] - typically the output of
+[`Optimizer.ask()`][amltk.optimization.Optimizer.ask], indicating
+what the optimizer would like to evaluate next.
+We provide a host of convenience methods attached to the `Trial` to make it easy to
+save results, store artifacts, and more.
-### History
+[`Trial.Report`][amltk.optimization.trial.Trial.Report] -
+the output of a [`trial.success(cost=...)`][amltk.optimization.trial.Trial.success] or
+[`trial.fail(cost=...)`][amltk.optimization.trial.Trial.fail] call.
+Provides an easy way to report back to the optimizer's
+[`tell()`][amltk.optimization.Optimizer.tell].
-::: amltk.optimization.history
-    options:
-      members: False
+
+
+### Trial
+A [`Trial`][amltk.optimization.Trial] encapsulates some configuration
+that needs to be evaluated. Typically, this is what is generated by an
+[`Optimizer.ask()`][amltk.optimization.Optimizer.ask] call.
+ +- [`trial.success()`][amltk.optimization.Trial.success] to generate a +success [`Report`][amltk.optimization.Trial.Report], typically +passing what your chosen optimizer expects, e.g., `"loss"` or `"cost"`. + +- [`trial.fail()`][amltk.optimization.Trial.fail] to generate a +failure [`Report`][amltk.optimization.Trial.Report]. +If an exception is passed to `fail()`, it will be attached to the report along with any traceback it can deduce. +Each [`Optimizer`][amltk.optimization.Optimizer] will take care of what to do from here. + +```python exec="true" source="material-block" result="python" +from amltk.optimization import Trial, Metric +from amltk.store import PathBucket + +cost = Metric("cost", minimize=True) + +def target_function(trial: Trial) -> Trial.Report: + x = trial.config["x"] + y = trial.config["y"] + + with trial.profile("expensive-calculation"): + cost = x**2 - y + + return trial.success(cost=cost) + +# ... usually obtained from an optimizer +trial = Trial.create( + name="some-unique-name", + config={"x": 1, "y": 2}, + metrics=[cost] +) + +report = target_function(trial) +print(report.df()) +trial.bucket.rmdir() # markdown-exec: hide +``` + + +What you can return with [`trial.success()`][amltk.optimization.Trial.success] +or [`trial.fail()`][amltk.optimization.Trial.fail] depends on the +[`metrics`][amltk.optimization.Trial.metrics] of the trial. Typically, +an optimizer will provide the trial with the list of [metrics](../optimization/metrics.md) + +Some important properties are that they have a unique +[`.name`][amltk.optimization.Trial.name] given the optimization run, +a candidate [`.config`][amltk.optimization.Trial.config] to evaluate, +a possible [`.seed`][amltk.optimization.Trial.seed] to use, +and an [`.info`][amltk.optimization.Trial.info] object, which is the optimizer +specific information, if required by you. + +!!! tip "Reporting success (or failure)" + + When using the [`success()`][amltk.optimization.trial.Trial.success] + method, make sure to provide values for all metrics specified in the + [`.metrics`][amltk.optimization.Trial.metrics] attribute. + Usually these are set by the optimizer generating the `Trial`. + + If you instead report using [`fail()`][amltk.optimization.trial.Trial.success], + any metric not specified will be set to the + [`.worst`][amltk.optimization.Metric.worst] value of the metric. + + Each metric has a unique name, and it's crucial to use the correct names when + reporting success, otherwise an error will occur. + + ??? example "Reporting success for metrics" + + For example: + + ```python exec="true" result="python" source="material-block" + from amltk.optimization import Trial, Metric + + # Gotten from some optimizer usually, i.e. via `optimizer.ask()` + trial = Trial.create( + name="example_trial", + config={"param": 42}, + metrics=[Metric(name="accuracy", minimize=False)] + ) + + # Incorrect usage (will raise an error) + try: + report = trial.success(invalid_metric=0.95) + except ValueError as error: + print(error) + + # Correct usage + report = trial.success(accuracy=0.95) + trial.bucket.rmdir() # markdown-exec: hide + ``` + +If using [`Plugins`][amltk.scheduling.plugins.Plugin], they may insert +some extra objects in the [`.extra`][amltk.optimization.Trial.extras] dict. + +To profile your trial, you can wrap the logic you'd like to check with +[`trial.profile()`][amltk.optimization.Trial.profile], which will automatically +profile the block of code for memory before and after as well as time taken. 
+ +If you've [`profile()`][amltk.optimization.Trial.profile]'ed any intervals, +you can access them by name through +[`trial.profiles`][amltk.optimization.Trial.profiles]. +Please see the [`Profiler`][amltk.profiling.profiler.Profiler] +for more. + +??? example "Profiling with a trial." + + ```python exec="true" source="material-block" result="python" title="profile" + from amltk.optimization import Trial + + trial = Trial.create(name="some-unique-name", config={}) + + # ... somewhere where you've begun your trial. + with trial.profile("some_interval"): + for work in range(100): + pass + + print(trial.profiler.df()) + trial.bucket.rmdir() # markdown-exec: hide + ``` + +You can also record anything you'd like into the +[`.summary`][amltk.optimization.Trial.summary], a plain `#!python dict` +or use [`trial.store()`][amltk.optimization.Trial.store] to store artifacts +related to the trial. + +??? tip "What to put in `.summary`?" + + For large items, e.g. predictions or models, these are highly advised to + [`.store()`][amltk.optimization.Trial.store] to disk, especially if using + a `Task` for multiprocessing. + + Further, if serializing the report using the + [`report.df()`][amltk.optimization.Trial.Report.df], + returning a single row, + or a [`History`][amltk.optimization.History] + with [`history.df()`][amltk.optimization.History.df] for a dataframe consisting + of many of the reports, then you'd likely only want to store things + that are scalar and can be serialised to disk by a pandas DataFrame. + + +### Report +The [`Trial.Report`][amltk.optimization.Trial.Report] encapsulates +a [`Trial`][amltk.optimization.Trial], its status and any metrics/exceptions +that may have occured. + +Typically you will not create these yourself, but instead use +[`trial.success()`][amltk.optimization.Trial.success] or +[`trial.fail()`][amltk.optimization.Trial.fail] to generate them. + +```python exec="true" source="material-block" result="python" +from amltk.optimization import Trial, Metric + +loss = Metric("loss", minimize=True) + +trial = Trial.create(name="trial", config={"x": 1}, metrics=[loss]) + +with trial.profile("fitting"): + # Do some work + # ... + report = trial.success(loss=1) + +print(report.df()) +trial.bucket.rmdir() # markdown-exec: hide +``` + +These reports are used to report back metrics to an +[`Optimizer`][amltk.optimization.Optimizer] +with [`Optimizer.tell()`][amltk.optimization.Optimizer.tell] but can also be +stored for your own uses. + +You can access the original trial with the +[`.trial`][amltk.optimization.Trial.Report.trial] attribute, and the +[`Status`][amltk.optimization.Trial.Status] of the trial with the +[`.status`][amltk.optimization.Trial.Report.status] attribute. + +You may also want to check out the [`History`][amltk.optimization.History] class +for storing a collection of `Report`s, allowing for an easier time to convert +them to a dataframe or perform some common Hyperparameter optimization parsing +of metrics. diff --git a/docs/reference/pipelines/spaces.md b/docs/reference/pipelines/spaces.md index a89a63f5..7a1953c6 100644 --- a/docs/reference/pipelines/spaces.md +++ b/docs/reference/pipelines/spaces.md @@ -32,14 +32,6 @@ from amltk._doc import doc_print; doc_print(print, c) # markdown-exec: hide What follow's below is a list of supported parsers you could pass `parser=` to extract a search space representation. 
-## ConfigSpace
-
-::: amltk.pipeline.parsers.configspace
-    options:
-      members: false
-
-## Optuna
-
-::: amltk.pipeline.parsers.optuna
-    options:
-      members: false
+* [`ConfigSpace`][amltk.pipeline.parsers.configspace] - A parser for the
+  [ConfigSpace](https://automl.github.io/ConfigSpace/master/) library.
+* [`Optuna`][amltk.pipeline.parsers.optuna] - A parser specifically for optuna.
diff --git a/docs/reference/scheduling/events.md b/docs/reference/scheduling/events.md
index 53f61ac8..58ff103e 100644
--- a/docs/reference/scheduling/events.md
+++ b/docs/reference/scheduling/events.md
@@ -1,5 +1,239 @@
## Events
+One of the primary ways to respond to `@events` emitted
+by a [`Task`][amltk.scheduling.Task] or
+the [`Scheduler`][amltk.scheduling.Scheduler]
+is through the use of a **callback**.
-::: amltk.scheduling.events
-    options:
-      members: False
+The reason for this is to enable an easier time for APIs to utilize
+multiprocessing and remote compute from the `Scheduler`, without having
+to burden users with knowing the details of how to use multiprocessing.
+
+A callback subscribes to some event using a decorator, but this can also be done in
+a functional style if preferred. The below example is based on the
+event [`@scheduler.on_start`][amltk.scheduling.Scheduler.on_start] but
+the same applies to all events.
+
+=== "Decorators"
+
+    ```python exec="true" source="material-block" html="true"
+    from amltk.scheduling import Scheduler
+
+    scheduler = Scheduler.with_processes(1)
+
+    @scheduler.on_start
+    def print_hello() -> None:
+        print("hello")
+
+    scheduler.run()
+    from amltk._doc import doc_print; doc_print(print, scheduler, fontsize="small") # markdown-exec: hide
+    ```
+
+=== "Functional"
+
+    ```python exec="true" source="material-block" html="true"
+    from amltk.scheduling import Scheduler
+
+    scheduler = Scheduler.with_processes(1)
+
+    def print_hello() -> None:
+        print("hello")
+
+    scheduler.on_start(print_hello)
+    scheduler.run()
+    from amltk._doc import doc_print; doc_print(print, scheduler, fontsize="small") # markdown-exec: hide
+    ```
+
+There are a number of ways to customize the behaviour of these callbacks, notably
+to control how often they get called and when they get called.
+
+??? tip "Callback customization"
+
+
+    === "`on('event', repeat=...)`"
+
+        This will cause the callback to be called `repeat` times successively.
+        This is most useful in combination with
+        [`@scheduler.on_start`][amltk.scheduling.Scheduler.on_start] to launch
+        a number of tasks at the start of the scheduler.
+
+        ```python exec="true" source="material-block" html="true" hl_lines="11"
+        from amltk import Scheduler
+
+        N_WORKERS = 2
+
+        def f(x: int) -> int:
+            return x * 2
+        from amltk._doc import make_picklable; make_picklable(f) # markdown-exec: hide
+
+        scheduler = Scheduler.with_processes(N_WORKERS)
+        task = scheduler.task(f)
+
+        @scheduler.on_start(repeat=N_WORKERS)
+        def on_start():
+            task.submit(1)
+
+        scheduler.run()
+        from amltk._doc import doc_print; doc_print(print, scheduler, fontsize="small") # markdown-exec: hide
+        ```
+
+    === "`on('event', max_calls=...)`"
+
+        Limit the number of times a callback can be called, after which the callback
+        will be ignored.
+ + ```python exec="true" source="material-block" html="True" hl_lines="13" + from asyncio import Future + from amltk.scheduling import Scheduler + + scheduler = Scheduler.with_processes(2) + + def expensive_function(x: int) -> int: + return x ** 2 + from amltk._doc import make_picklable; make_picklable(expensive_function) # markdown-exec: hide + + @scheduler.on_start + def submit_calculations() -> None: + scheduler.submit(expensive_function, 2) + + @scheduler.on_future_result(max_calls=3) + def print_result(future, result) -> None: + scheduler.submit(expensive_function, 2) + + scheduler.run() + from amltk._doc import doc_print; doc_print(print, scheduler, output="html", fontsize="small") # markdown-exec: hide + ``` + + === "`on('event', when=...)`" + + A callable which takes no arguments and returns a `bool`. The callback + will only be called when the `when` callable returns `True`. + + Below is a rather contrived example, but it shows how we can use the + `when` parameter to control when the callback is called. + + ```python exec="true" source="material-block" html="True" hl_lines="8 12" + import random + from amltk.scheduling import Scheduler + + LOCALE = random.choice(["English", "German"]) + + scheduler = Scheduler.with_processes(1) + + @scheduler.on_start(when=lambda: LOCALE == "English") + def print_hello() -> None: + print("hello") + + @scheduler.on_start(when=lambda: LOCALE == "German") + def print_guten_tag() -> None: + print("guten tag") + + scheduler.run() + from amltk._doc import doc_print; doc_print(print, scheduler, output="html", fontsize="small") # markdown-exec: hide + ``` + + === "`on('event', every=...)`" + + Only call the callback every `every` times the event is emitted. This + includes the first time it's called. + + ```python exec="true" source="material-block" html="True" hl_lines="6" + from amltk.scheduling import Scheduler + + scheduler = Scheduler.with_processes(1) + + # Print "hello" only every 2 times the scheduler starts. + @scheduler.on_start(every=2) + def print_hello() -> None: + print("hello") + + # Run the scheduler 5 times + scheduler.run() + scheduler.run() + scheduler.run() + scheduler.run() + scheduler.run() + from amltk._doc import doc_print; doc_print(print, scheduler, output="html", fontsize="small") # markdown-exec: hide + ``` + +### Emitter, Subscribers and Events +This part of the documentation is not necessary to understand or use for AMLTK. People +wishing to build tools upon AMLTK may still find this a useful component to add to their +arsenal. + +The core of making this functionality work is the [`Emitter`][amltk.scheduling.events.Emitter]. +Its purpose is to have `@events` that can be emitted and subscribed to. Classes like the +[`Scheduler`][amltk.scheduling.Scheduler] and [`Task`][amltk.scheduling.Task] carry +around with them an `Emitter` to enable all of this functionality. + +Creating an `Emitter` is rather straight-forward, but we must also create +[`Events`][amltk.scheduling.events.Event] that people can subscribe to. + +```python +from amltk.scheduling import Emitter, Event +emitter = Emitter("my-emitter") + +event: Event[int] = Event("my-event") # (1)! + +@emitter.on(event) +def my_callback(x: int) -> None: + print(f"Got {x}!") + +emitter.emit(event, 42) # (2)! +``` + +1. The typing `#!python Event[int]` is used to indicate that the event will be emitting + an integer. This is not necessary, but it is useful for type-checking and + documentation. +2. The `#!python emitter.emit(event, 42)` is used to emit the event. 
This will call
+   all the callbacks registered for the event, i.e. `#!python my_callback()`.
+
+!!! warning "Independent Events"
+
+    Given a single `Emitter` and a single instance of an `Event`, there is no way to
+    have different `@events` for callbacks. There are two options, both used extensively
+    in AMLTK.
+
+    The first is to have different `Events` quite naturally, i.e. you distinguish
+    between different things that can happen. However, you often want to have different
+    objects emit the same `Event` but have different callbacks for each object.
+
+    This makes most sense in the context of a `Task`: the `Event` instances are shared as
+    class variables of the `Task` class, however a user likely wants to subscribe to
+    the `Event` of a specific instance of the `Task`.
+
+    This is where the second option comes in, in which each object carries around its
+    own `Emitter` instance. This is how a user can subscribe to the same kind of `Event`
+    but individually for each `Task`.
+
+
+However, to shield users from this and to create named access points for users to
+subscribe to, we can use the [`Subscriber`][amltk.scheduling.events.Subscriber] class,
+conveniently created by the [`Emitter.subscriber()`][amltk.scheduling.events.Emitter.subscriber]
+method.
+
+```python
+from amltk.scheduling import Emitter, Event
+from amltk.scheduling.events import Subscriber
+
+emitter = Emitter("my-emitter")
+
+class GPT:
+
+    event: Event[str] = Event("my-event")
+
+    def __init__(self) -> None:
+        self.on_answer: Subscriber[str] = emitter.subscriber(self.event)
+
+    def ask(self, question: str) -> None:
+        emitter.emit(self.event, "hello world!")
+
+gpt = GPT()
+
+@gpt.on_answer
+def print_answer(answer: str) -> None:
+    print(answer)
+
+gpt.ask("What is the canonical way for an AI to greet someone?")
+```
+
+Typically these event-based systems make little sense in a synchronous context, however
+with the [`Scheduler`][amltk.scheduling.Scheduler] and [`Task`][amltk.scheduling.Task]
+classes, they are used to enable a simple way to use multiprocessing and remote compute.
diff --git a/docs/reference/scheduling/queue_monitor.md b/docs/reference/scheduling/queue_monitor.md
index 42b72a37..92be0fb7 100644
--- a/docs/reference/scheduling/queue_monitor.md
+++ b/docs/reference/scheduling/queue_monitor.md
@@ -1,5 +1,64 @@
## Queue Monitor
+A [`QueueMonitor`][amltk.scheduling.queue_monitor.QueueMonitor] is a
+monitor for the scheduler queue.
-::: amltk.scheduling.queue_monitor
-    options:
-      members: False
+It tracks the
+queue state at every event emitted by the scheduler. The data can be converted
+to a pandas DataFrame or plotted as a stacked barchart.
+
+!!! note "Monitoring Frequency"
+
+    To prevent repeated polling, we sample the scheduler queue at every scheduler event.
+    This is because the queue is only modified upon one of these events. This means we
+    don't need to poll the queue at a fixed interval. However, if you need more fine-grained
+    updates, you can add extra events/timings at which the monitor should
+    [`update()`][amltk.scheduling.queue_monitor.QueueMonitor.update].
+
+!!! warning "Performance impact"
+
+    If your tasks and callbacks are very fast (~sub 10ms), then the monitor has a
+    non-negligible impact; however, for most use cases this should not be a problem.
+    As with anything, you should profile how much work the scheduler can get done,
+    with and without the monitor, to see if it is a problem for your use case.
+
+In the example below, we have a very fast-running function that runs on repeat,
+sometimes too fast for the scheduler to keep up, letting some futures build up that need
+to be processed.
+
+```python exec="true" source="material-block" result="python" session="queue-monitor"
+import time
+import matplotlib.pyplot as plt
+from amltk.scheduling import Scheduler
+from amltk.scheduling.queue_monitor import QueueMonitor
+
+def fast_function(x: int) -> int:
+    return x + 1
+from amltk._doc import make_picklable; make_picklable(fast_function) # markdown-exec: hide
+
+N_WORKERS = 2
+scheduler = Scheduler.with_processes(N_WORKERS)
+monitor = QueueMonitor(scheduler)
+task = scheduler.task(fast_function)
+
+@scheduler.on_start(repeat=N_WORKERS)
+def start():
+    task.submit(1)
+
+@task.on_result
+def result(_, x: int):
+    if scheduler.running():
+        task.submit(x)
+
+scheduler.run(timeout=1)
+df = monitor.df()
+print(df)
+```
+
+We can also [`plot()`][amltk.scheduling.queue_monitor.QueueMonitor.plot] the data as a
+stacked barchart with a set interval.
+
+```python exec="true" source="material-block" html="true" session="queue-monitor"
+fig, ax = plt.subplots()
+monitor.plot(interval=(50, "ms"))
+from io import StringIO; fig.tight_layout(); buffer = StringIO(); plt.savefig(buffer, format="svg"); print(buffer.getvalue()) # markdown-exec: hide
+```
diff --git a/docs/reference/scheduling/scheduler.md b/docs/reference/scheduling/scheduler.md
index d5a6a2f9..498c11aa 100644
--- a/docs/reference/scheduling/scheduler.md
+++ b/docs/reference/scheduling/scheduler.md
@@ -1,5 +1,284 @@
## Scheduler
+The [`Scheduler`][amltk.scheduling.Scheduler] uses
+an [`Executor`][concurrent.futures.Executor], a Python builtin with
+a `#!python submit(f, *args, **kwargs)` method, to submit compute to
+be computed elsewhere, whether locally or remotely.
-::: amltk.scheduling.scheduler
-    options:
-      members: False
+The `Scheduler` is primarily used to dispatch compute to an `Executor` and
+emit `@events`, which can trigger user callbacks.
+
+Typically you should not use the `Scheduler` directly for dispatching and
+responding to computed functions, but rather use a [`Task`][amltk.scheduling.Task].
+
+??? note "Running in a Jupyter Notebook/Colab"
+
+    If you are using a Jupyter Notebook, you likely need to use the following
+    at the top of your notebook:
+
+    ```python
+    import nest_asyncio # Only necessary in Notebooks
+    nest_asyncio.apply()
+
+    scheduler.run(...)
+    ```
+
+    This is because a notebook runs in an async context. If you do not
+    wish to use the above snippet, you can instead use:
+
+    ```python
+    await scheduler.async_run(...)
+    ```
+
+??? tip "Basic Usage"
+
+    In this example, we create a scheduler that uses local processes as
+    workers. We then create a task that will run a function `fn` and submit it
+    to the scheduler. Lastly, a callback is registered to `@future-result` to print the
+    result when the compute is done.
+ + ```python exec="true" source="material-block" html="true" + from amltk.scheduling import Scheduler + + def fn(x: int) -> int: + return x + 1 + from amltk._doc import make_picklable; make_picklable(fn) # markdown-exec: hide + + scheduler = Scheduler.with_processes(1) + + @scheduler.on_start + def launch_the_compute(): + scheduler.submit(fn, 1) + + @scheduler.on_future_result + def callback(future, result): + print(f"Result: {result}") + + scheduler.run() + from amltk._doc import doc_print; doc_print(print, scheduler) # markdown-exec: hide + ``` + + The last line in the previous example called + [`scheduler.run()`][amltk.scheduling.Scheduler.run] is what starts the scheduler + running, in which it will first emit the `@start` event. This triggered the + callback `launch_the_compute()` which submitted the function `fn` with the + arguments `#!python 1`. + + The scheduler then ran the compute and waited for it to complete, emitting the + `@future-result` event when it was done successfully. This triggered the callback + `callback()` which printed the result. + + At this point, there is no more compute happening and no more events to respond to + so the scheduler will halt. + +??? example "`@events`" + + === "Scheduler Status Events" + + When the scheduler enters some important state, it will emit an event + to let you know. + + === "`@start`" + + ::: amltk.scheduling.Scheduler.on_start + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@finishing`" + + ::: amltk.scheduling.Scheduler.on_finishing + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@finished`" + + ::: amltk.scheduling.Scheduler.on_finished + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@stop`" + + ::: amltk.scheduling.Scheduler.on_stop + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@timeout`" + + ::: amltk.scheduling.Scheduler.on_timeout + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@empty`" + + ::: amltk.scheduling.Scheduler.on_empty + options: + show_root_heading: False + show_root_toc_entry: False + + === "Submitted Compute Events" + + When any compute goes through the `Scheduler`, it will emit an event + to let you know. You should however prefer to use a + [`Task`][amltk.scheduling.Task] as it will emit specific events + for the task at hand, and not all compute. + + === "`@future-submitted`" + + ::: amltk.scheduling.Scheduler.on_future_submitted + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@future-result`" + + ::: amltk.scheduling.Scheduler.on_future_result + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@future-exception`" + + ::: amltk.scheduling.Scheduler.on_future_exception + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@future-done`" + + ::: amltk.scheduling.Scheduler.on_future_done + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@future-cancelled`" + + ::: amltk.scheduling.Scheduler.on_future_cancelled + options: + show_root_heading: False + show_root_toc_entry: False + + +??? tip "Common usages of `run()`" + + There are various ways to [`run()`][amltk.scheduling.Scheduler.run] the + scheduler, notably how long it should run with `timeout=` and also how + it should react to any exception that may have occurred within the `Scheduler` + itself or your callbacks. 
+ + Please see the [`run()`][amltk.scheduling.Scheduler.run] API doc for more + details and features, however we show two common use cases of using the `timeout=` + parameter. + + You can render a live display using [`run(display=...)`][amltk.scheduling.Scheduler.run]. + This require [`rich`](https://github.com/Textualize/rich) to be installed. You + can install this with `#!bash pip install rich` or `#!bash pip install amltk[rich]`. + + + === "`run(timeout=...)`" + + You can tell the `Scheduler` to stop after a certain amount of time + with the `timeout=` argument to [`run()`][amltk.scheduling.Scheduler.run]. + + This will also trigger the `@timeout` event as seen in the `Scheduler` output. + + ```python exec="true" source="material-block" html="True" hl_lines="19" + import time + from asyncio import Future + + from amltk.scheduling import Scheduler + + scheduler = Scheduler.with_processes(1) + + def expensive_function() -> int: + time.sleep(0.1) + return 42 + from amltk._doc import make_picklable; make_picklable(expensive_function) # markdown-exec: hide + + @scheduler.on_start + def submit_calculations() -> None: + scheduler.submit(expensive_function) + + # This will endlessly loop the scheduler + @scheduler.on_future_done + def submit_again(future: Future) -> None: + if scheduler.running(): + scheduler.submit(expensive_function) + + scheduler.run(timeout=1) # End after 1 second + from amltk._doc import doc_print; doc_print(print, scheduler, output="html", fontsize="small") # markdown-exec: hide + ``` + + === "`run(timeout=..., wait=False)`" + + By specifying that the `Scheduler` should not wait for ongoing tasks + to finish, the `Scheduler` will attempt to cancel and possibly terminate + any running tasks. + + ```python exec="true" source="material-block" html="True" + import time + from amltk.scheduling import Scheduler + + scheduler = Scheduler.with_processes(1) + + def expensive_function() -> None: + time.sleep(10) + + from amltk._doc import make_picklable; make_picklable(expensive_function) # markdown-exec: hide + + @scheduler.on_start + def submit_calculations() -> None: + scheduler.submit(expensive_function) + + scheduler.run(timeout=1, wait=False) # End after 1 second + from amltk._doc import doc_print; doc_print(print, scheduler, output="html", fontsize="small") # markdown-exec: hide + ``` + + ??? info "Forcibly Terminating Workers" + + As an `Executor` does not provide an interface to forcibly + terminate workers, we provide `Scheduler(terminate=...)` as a custom + strategy for cleaning up a provided executor. It is not possible + to terminate running thread based workers, for example using + `ThreadPoolExecutor` and any Executor using threads to spawn + tasks will have to wait until all running tasks are finish + before python can close. + + It's likely `terminate` will trigger the `EXCEPTION` event for + any tasks that are running during the shutdown, **not*** + a cancelled event. This is because we use a + [`Future`][concurrent.futures.Future] + under the hood and these can not be cancelled once running. + However there is no guarantee of this and is up to how the + `Executor` handles this. + +??? example "Scheduling something to be run later" + + You can schedule some function to be run later using the + [`#!python scheduler.call_later()`][amltk.scheduling.Scheduler.call_later] method. + + !!! 
note + + This does not run the function in the background, it just schedules some + function to be called later, where you could perhaps then use submit to + scheduler a [`Task`][amltk.scheduling.Task] to run the function in the + background. + + ```python exec="true" source="material-block" result="python" + from amltk.scheduling import Scheduler + + scheduler = Scheduler.with_processes(1) + + def fn() -> int: + print("Ending now!") + scheduler.stop() + + @scheduler.on_start + def schedule_fn() -> None: + scheduler.call_later(1, fn) + + scheduler.run(end_on_empty=False) + ``` diff --git a/docs/reference/scheduling/task.md b/docs/reference/scheduling/task.md index f7fdd111..51b0ccbe 100644 --- a/docs/reference/scheduling/task.md +++ b/docs/reference/scheduling/task.md @@ -1,5 +1,88 @@ ## Tasks +A [`Task`][amltk.scheduling.task.Task] is a unit of work that can be scheduled by the +[`Scheduler`][amltk.scheduling.Scheduler]. -::: amltk.scheduling.task - options: - members: False +It is defined by its `function=` to call. Whenever a `Task` +has its [`submit()`][amltk.scheduling.task.Task.submit] method called, +the function will be dispatched to run by a `Scheduler`. + +When a task has returned, either successfully, or with an exception, +it will emit `@events` to indicate so. You can subscribe to these events +with callbacks and act accordingly. + + +??? example "`@events`" + + Check out the `@events` reference + for more on how to customize these callbacks. You can also take a look + at the API of [`on()`][amltk.scheduling.task.Task.on] for more information. + + === "`@on-result`" + + ::: amltk.scheduling.task.Task.on_result + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@on-exception`" + + ::: amltk.scheduling.task.Task.on_exception + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@on-done`" + + ::: amltk.scheduling.task.Task.on_done + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@on-submitted`" + + ::: amltk.scheduling.task.Task.on_submitted + options: + show_root_heading: False + show_root_toc_entry: False + + === "`@on-cancelled`" + + ::: amltk.scheduling.task.Task.on_cancelled + options: + show_root_heading: False + show_root_toc_entry: False + +??? tip "Usage" + + The usual way to create a task is with + [`Scheduler.task()`][amltk.scheduling.scheduler.Scheduler.task], + where you provide the `function=` to call. + + ```python exec="true" source="material-block" html="true" + from amltk import Scheduler + from asyncio import Future + + def f(x: int) -> int: + return x * 2 + from amltk._doc import make_picklable; make_picklable(f) # markdown-exec: hide + + scheduler = Scheduler.with_processes(2) + task = scheduler.task(f) + + @scheduler.on_start + def on_start(): + task.submit(1) + + @task.on_result + def on_result(future: Future[int], result: int): + print(f"Task {future} returned {result}") + + scheduler.run() + from amltk._doc import doc_print; doc_print(print, scheduler) # markdown-exec: hide + ``` + + If you'd like to simply just call the original function, without submitting it to + the scheduler, you can always just call the task directly, i.e. `#!python task(1)`. + +You can also provide [`Plugins`][amltk.scheduling.plugins.Plugin] to the task, +to modify tasks, add functionality and add new events. 
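+
+As a rough sketch of how a plugin is attached (the plugin name and its arguments here
+are assumptions to be checked against the plugin reference), the snippet below passes a
+`Limiter`-style plugin when creating the task, assumed to cap how many times the task
+may be submitted via a `max_calls=` argument.
+
+```python
+from amltk.scheduling import Scheduler
+from amltk.scheduling.plugins import Limiter  # assumed import path for the Limiter plugin
+
+def f(x: int) -> int:
+    return x * 2
+
+scheduler = Scheduler.with_processes(1)
+
+# Assumption: `Limiter(max_calls=2)` caps how many times this task can be submitted.
+task = scheduler.task(f, plugins=[Limiter(max_calls=2)])
+
+@scheduler.on_start
+def on_start() -> None:
+    task.submit(1)
+
+@task.on_result
+def on_result(future, result: int) -> None:
+    print(f"Got {result}")
+
+scheduler.run()
+```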
diff --git a/src/amltk/_doc.py b/src/amltk/_doc.py index 59c045c3..da766aa1 100644 --- a/src/amltk/_doc.py +++ b/src/amltk/_doc.py @@ -1,5 +1,6 @@ from __future__ import annotations +import importlib import os from collections.abc import Callable from functools import lru_cache @@ -47,17 +48,17 @@ def link(obj: Any) -> str | None: return _try_get_link(fullname(obj)) -def make_picklable(thing: Any, name: str | None = None) -> None: +def make_picklable(thing: Any) -> None: """This is hack to make the examples code with schedulers work. Scheduler uses multiprocessing and multiprocessing requires that all objects passed to the scheduler are picklable. This is not the case for the classes/functions defined in the example code. """ - import __main__ + thing_module = thing.__module__ - _name = thing.__name__ if name is None else name - setattr(__main__, _name, thing) + _mod = importlib.import_module(thing_module) + setattr(_mod, thing.__name__, thing) def as_rich_svg( diff --git a/src/amltk/metalearning/dataset_distances.py b/src/amltk/metalearning/dataset_distances.py index cc99a61e..4d29fe79 100644 --- a/src/amltk/metalearning/dataset_distances.py +++ b/src/amltk/metalearning/dataset_distances.py @@ -1,92 +1,4 @@ -"""One common way to define how similar two datasets are is to compute some "similarity" -between them. This notion of "similarity" requires computing some features of a dataset -(**metafeatures**) first, such that we can numerically compute some distance function. - -Let's see how we can quickly compute the distance between some datasets with -[`dataset_distance()`][amltk.metalearning.dataset_distance]! - -```python exec="true" source="material-block" result="python" title="Dataset Distances P.1" session='dd' -import pandas as pd -import openml - -from amltk.metalearning import compute_metafeatures - -def get_dataset(dataset_id: int) -> tuple[pd.DataFrame, pd.Series]: - dataset = openml.datasets.get_dataset( - dataset_id, - download_data=True, - download_features_meta_data=False, - download_qualities=False, - ) - X, y, _, _ = dataset.get_data( - dataset_format="dataframe", - target=dataset.default_target_attribute, - ) - return X, y - -d31 = get_dataset(31) -d3 = get_dataset(3) -d4 = get_dataset(4) - -metafeatures_dict = { - "dataset_31": compute_metafeatures(*d31), - "dataset_3": compute_metafeatures(*d3), - "dataset_4": compute_metafeatures(*d4), -} - -metafeatures = pd.DataFrame(metafeatures_dict) -print(metafeatures) -``` - -Now we want to know which one of `#!python "dataset_3"` or `#!python "dataset_4"` is -more _similar_ to `#!python "dataset_31"`. - -```python exec="true" source="material-block" result="python" title="Dataset Distances P.2" session='dd' -from amltk.metalearning import dataset_distance - -target = metafeatures_dict.pop("dataset_31") -others = metafeatures_dict - -distances = dataset_distance(target, others, distance_metric="l2") -print(distances) -``` - -Seems like `#!python "dataset_3"` is some notion of closer to `#!python "dataset_31"` -than `#!python "dataset_4"`. However the scale of the metafeatures are not exactly all close. -For example, many lie between `#!python (0, 1)` but some like `instance_count` can completely -dominate the show. - -Lets repeat the computation but specify that we should apply a `#!python "minmax"` scaling -across the rows. 
- -```python exec="true" source="material-block" result="python" title="Dataset Distances P.3" session='dd' hl_lines="5" -distances = dataset_distance( - target, - others, - distance_metric="l2", - scaler="minmax" -) -print(distances) -``` - -Now `#!python "dataset_3"` is considered more similar but the difference between the two is a lot less -dramatic. In general, applying some scaling to values of different scales is required for metalearning. - -You can also use an [sklearn.preprocessing.MinMaxScaler][] or anything other scaler from scikit-learn -for that matter. - -```python exec="true" source="material-block" result="python" title="Dataset Distances P.3" session='dd' hl_lines="7" -from sklearn.preprocessing import MinMaxScaler - -distances = dataset_distance( - target, - others, - distance_metric="l2", - scaler=MinMaxScaler() -) -print(distances) -``` -""" # noqa: E501 +"""Calculating metadata distances.""" from __future__ import annotations import warnings diff --git a/src/amltk/metalearning/metafeatures.py b/src/amltk/metalearning/metafeatures.py index 6bea8a6c..75ee1bdd 100644 --- a/src/amltk/metalearning/metafeatures.py +++ b/src/amltk/metalearning/metafeatures.py @@ -1,140 +1,4 @@ -'''A [`MetaFeature`][amltk.metalearning.MetaFeature] is some -statistic about a dataset/task, that can be used to make datasets or -tasks more comparable, thus enabling meta-learning methods. - -Calculating meta-features of a dataset is quite straight foward. - -```python exec="true" source="material-block" result="python" title="Metafeatures" hl_lines="10" -import openml -from amltk.metalearning import compute_metafeatures - -dataset = openml.datasets.get_dataset( - 31, # credit-g - download_data=True, - download_features_meta_data=False, - download_qualities=False, -) -X, y, _, _ = dataset.get_data( - dataset_format="dataframe", - target=dataset.default_target_attribute, -) - -mfs = compute_metafeatures(X, y) - -print(mfs) -``` - -By default [`compute_metafeatures()`][amltk.metalearning.compute_metafeatures] will -calculate all the [`MetaFeature`][amltk.metalearning.MetaFeature] implemented, -iterating through their subclasses to do so. You can pass an explicit list -as well to `compute_metafeatures(X, y, features=[...])`. - -To implement your own is also quite straight forward: - -```python exec="true" source="material-block" result="python" title="Create Metafeature" hl_lines="10 11 12 13 14 15 16 17 18 19" -from amltk.metalearning import MetaFeature, compute_metafeatures -import openml - -dataset = openml.datasets.get_dataset( - 31, # credit-g - download_data=True, - download_features_meta_data=False, - download_qualities=False, -) -X, y, _, _ = dataset.get_data( - dataset_format="dataframe", - target=dataset.default_target_attribute, -) - -class TotalValues(MetaFeature): - - @classmethod - def compute( - cls, - x: pd.DataFrame, - y: pd.Series | pd.DataFrame, - dependancy_values: dict, - ) -> int: - return int(x.shape[0] * x.shape[1]) - -mfs = compute_metafeatures(X, y, features=[TotalValues]) -print(mfs) -``` - -As many metafeatures rely on pre-computed dataset statistics, and they do not -need to be calculated more than once, you can specify the dependancies of -a meta feature. When a metafeature would return something other than a single -value, i.e. a `dict` or a `pd.DataFrame`, we instead call those a -[`DatasetStatistic`][amltk.metalearning.DatasetStatistic]. These will -**not** be included in the result of [`compute_metafeatures()`][amltk.metalearning.compute_metafeatures]. 
-These `DatasetStatistic`s will only be calculated once on a call to `compute_metafeatures()` so -they can be re-used across all `MetaFeature`s that require that dependancy. - -```python exec="true" source="material-block" result="python" title="Metafeature Dependancy" hl_lines="10 11 12 13 14 15 16 17 18 19 20 23 26 35" -from amltk.metalearning import MetaFeature, DatasetStatistic, compute_metafeatures -import openml - -dataset = openml.datasets.get_dataset( - 31, # credit-g - download_data=True, - download_features_meta_data=False, - download_qualities=False, -) -X, y, _, _ = dataset.get_data( - dataset_format="dataframe", - target=dataset.default_target_attribute, -) - -class NAValues(DatasetStatistic): - """A mask of all NA values in a dataset""" - - @classmethod - def compute( - cls, - x: pd.DataFrame, - y: pd.Series | pd.DataFrame, - dependancy_values: dict, - ) -> pd.DataFrame: - return x.isna() - - -class PercentageNA(MetaFeature): - """The percentage of values missing""" - - dependencies = (NAValues,) - - @classmethod - def compute( - cls, - x: pd.DataFrame, - y: pd.Series | pd.DataFrame, - dependancy_values: dict, - ) -> int: - na_values = dependancy_values[NAValues] - n_na = na_values.sum().sum() - n_values = int(x.shape[0] * x.shape[1]) - return float(n_na / n_values) - -mfs = compute_metafeatures(X, y, features=[PercentageNA]) -print(mfs) -``` - -To view the description of a particular `MetaFeature`, you can call -[`.description()`][amltk.metalearning.DatasetStatistic.description] -on it. Otherwise you can access all of them in the following way: - -```python exec="true" source="tabbed-left" result="python" title="Metafeature Descriptions" hl_lines="4" -from pprint import pprint -from amltk.metalearning import metafeature_descriptions - -descriptions = metafeature_descriptions() -for name, description in descriptions.items(): - print("---") - print(name) - print("---") - print(" * " + description) -``` -''' # noqa: E501 +"""Metafeatures access.""" from __future__ import annotations import logging diff --git a/src/amltk/metalearning/portfolio.py b/src/amltk/metalearning/portfolio.py index eb3bc71a..058e8469 100644 --- a/src/amltk/metalearning/portfolio.py +++ b/src/amltk/metalearning/portfolio.py @@ -1,115 +1,4 @@ -"""A portfolio in meta-learning is to a set (ordered or not) of configurations -that maximize some notion of coverage across datasets or tasks. -The intuition here is that this also means that any new dataset is also covered! - -Suppose we have the given performances of some configurations across some datasets. -```python exec="true" source="material-block" result="python" title="Initial Portfolio" -import pandas as pd - -performances = { - "c1": [90, 60, 20, 10], - "c2": [20, 10, 90, 20], - "c3": [10, 20, 40, 90], - "c4": [90, 10, 10, 10], -} -portfolio = pd.DataFrame(performances, index=["dataset_1", "dataset_2", "dataset_3", "dataset_4"]) -print(portfolio) -``` - -If we could only choose `#!python k=3` of these configurations on some new given dataset, which ones would -you choose and in what priority? -Here is where we can apply [`portfolio_selection()`][amltk.metalearning.portfolio_selection]! - -The idea is that we pick a subset of these algorithms that maximise some value of utility for -the portfolio. We do this by adding a single configuration from the entire set, 1-by-1 until -we reach `k`, beginning with the empty portfolio. - -Let's see this in action! 
- -```python exec="true" source="material-block" result="python" title="Portfolio Selection" hl_lines="12 13 14 15 16" -import pandas as pd -from amltk.metalearning import portfolio_selection - -performances = { - "c1": [90, 60, 20, 10], - "c2": [20, 10, 90, 20], - "c3": [10, 20, 40, 90], - "c4": [90, 10, 10, 10], -} -portfolio = pd.DataFrame(performances, index=["dataset_1", "dataset_2", "dataset_3", "dataset_4"]) - -selected_portfolio, trajectory = portfolio_selection( - portfolio, - k=3, - scaler="minmax" -) - -print(selected_portfolio) -print() -print(trajectory) -``` - -The trajectory tells us which configuration was added at each time stamp along with the utility -of the portfolio with that configuration added. However we havn't specified how _exactly_ we defined the -utility of a given portfolio. We could define our own function to do so: - -```python exec="true" source="material-block" result="python" title="Portfolio Selection Custom" hl_lines="12 13 14 20" -import pandas as pd -from amltk.metalearning import portfolio_selection - -performances = { - "c1": [90, 60, 20, 10], - "c2": [20, 10, 90, 20], - "c3": [10, 20, 40, 90], - "c4": [90, 10, 10, 10], -} -portfolio = pd.DataFrame(performances, index=["dataset_1", "dataset_2", "dataset_3", "dataset_4"]) - -def my_function(p: pd.DataFrame) -> float: - # Take the maximum score for each dataset and then take the mean across them. - return p.max(axis=1).mean() - -selected_portfolio, trajectory = portfolio_selection( - portfolio, - k=3, - scaler="minmax", - portfolio_value=my_function, -) - -print(selected_portfolio) -print() -print(trajectory) -``` - -This notion of reducing across all configurations for a dataset and then aggregating these is common -enough that we can also directly just define these operations and we will perform the rest. - -```python exec="true" source="material-block" result="python" title="Portfolio Selection With Reduction" hl_lines="17 18" -import pandas as pd -import numpy as np -from amltk.metalearning import portfolio_selection - -performances = { - "c1": [90, 60, 20, 10], - "c2": [20, 10, 90, 20], - "c3": [10, 20, 40, 90], - "c4": [90, 10, 10, 10], -} -portfolio = pd.DataFrame(performances, index=["dataset_1", "dataset_2", "dataset_3", "dataset_4"]) - -selected_portfolio, trajectory = portfolio_selection( - portfolio, - k=3, - scaler="minmax", - row_reducer=np.max, # This is actually the default - aggregator=np.mean, # This is actually the default -) - -print(selected_portfolio) -print() -print(trajectory) -``` -""" # noqa: E501 +"""Portfolio selection.""" from __future__ import annotations diff --git a/src/amltk/optimization/metric.py b/src/amltk/optimization/metric.py index 06cdbe42..885b28d7 100644 --- a/src/amltk/optimization/metric.py +++ b/src/amltk/optimization/metric.py @@ -1,35 +1,4 @@ -"""A [`Metric`][amltk.optimization.Metric] to let optimizers know how to -handle numeric values properly. - -A `Metric` is defined by a `.name: str` and whether it is better to `.minimize: bool` -the metric. Further, you can specify `.bounds: tuple[lower, upper]` which can -help optimizers and other code know how to treat metrics. - -To easily convert between `loss` and -`score` of some value you can use the [`loss()`][amltk.optimization.Metric.loss] -and [`score()`][amltk.optimization.Metric.score] methods. - -If the metric is bounded, you can also make use of the -[`distance_to_optimal()`][amltk.optimization.Metric.distance_to_optimal] -function which is the distance to the optimal value. 
- -In the case of optimization, we provide a -[`normalized_loss()`][amltk.optimization.Metric.normalized_loss] which -normalized the value to be a minimization loss, that is also bounded -if the metric itself is bounded. - -```python exec="true" source="material-block" result="python" -from amltk.optimization import Metric - -acc = Metric("accuracy", minimize=False, bounds=(0, 100)) - -print(f"Distance: {acc.distance_to_optimal(90)}") # Distance to optimal. -print(f"Loss: {acc.loss(90)}") # Something that can be minimized -print(f"Score: {acc.score(90)}") # Something that can be maximized -print(f"Normalized loss: {acc.normalized_loss(90)}") # Normalized loss -``` - -""" +"""The metric definition.""" from __future__ import annotations from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence diff --git a/src/amltk/optimization/trial.py b/src/amltk/optimization/trial.py index 2a24be0f..4ceeeed5 100644 --- a/src/amltk/optimization/trial.py +++ b/src/amltk/optimization/trial.py @@ -1,29 +1,4 @@ -"""A [`Trial`][amltk.optimization.Trial] is -typically the output of -[`Optimizer.ask()`][amltk.optimization.Optimizer.ask], indicating -what the optimizer would like to evaluate next. We provide a host -of convenience methods attached to the `Trial` to make it easy to -save results, store artifacts, and more. - -Paired with the `Trial` is the [`Trial.Report`][amltk.optimization.Trial.Report], -class, providing an easy way to report back to the optimizer's -[`tell()`][amltk.optimization.Optimizer.tell] with -a simple [`trial.success(cost=...)`][amltk.optimization.Trial.success] or -[`trial.fail(cost=...)`][amltk.optimization.Trial.fail] call.. - -### Trial - -::: amltk.optimization.trial.Trial - options: - members: False - -### Report - -::: amltk.optimization.trial.Trial.Report - options: - members: False - -""" +"""The Trial and Report class.""" from __future__ import annotations import copy @@ -78,159 +53,7 @@ @dataclass(kw_only=True) class Trial(RichRenderable, Generic[I]): - """A [`Trial`][amltk.optimization.Trial] encapsulates some configuration - that needs to be evaluated. Typically, this is what is generated by an - [`Optimizer.ask()`][amltk.optimization.Optimizer.ask] call. - - ??? tip "Usage" - - If all went smooth, your trial was successful and you can use - [`trial.success()`][amltk.optimization.Trial.success] to generate - a success [`Report`][amltk.optimization.Trial.Report], typically - passing what your chosen optimizer expects, e.g., `"loss"` or `"cost"`. - - If your trial failed, you can instead use the - [`trial.fail()`][amltk.optimization.Trial.fail] to generate a - failure [`Report`][amltk.optimization.Trial.Report]. If use pass - in an exception to `fail()`, it will be attached to the report along - with any traceback it can deduce. - Each [`Optimizer`][amltk.optimization.Optimizer] will take - care of what to do from here. - - ```python exec="true" source="material-block" html="true" - from amltk.optimization import Trial, Metric - from amltk.store import PathBucket - - cost = Metric("cost", minimize=True) - - def target_function(trial: Trial) -> Trial.Report: - x = trial.config["x"] - y = trial.config["y"] - - with trial.profile("expensive-calculation"): - cost = x**2 - y - - return trial.success(cost=cost) - - # ... 
usually obtained from an optimizer - trial = Trial.create( - name="some-unique-name", - config={"x": 1, "y": 2}, - metrics=[cost] - ) - - report = target_function(trial) - print(report.df()) - trial.bucket.rmdir() # markdown-exec: hide - ``` - - - What you can return with [`trial.success()`][amltk.optimization.Trial.success] - or [`trial.fail()`][amltk.optimization.Trial.fail] depends on the - [`metrics`][amltk.optimization.Trial.metrics] of the trial. Typically, - an optimizer will provide the trial with the list of metrics. - - ??? tip "Metrics" - - ::: amltk.optimization.metric.Metric - options: - members: False - - Some important properties are that they have a unique - [`.name`][amltk.optimization.Trial.name] given the optimization run, - a candidate [`.config`][amltk.optimization.Trial.config] to evaluate, - a possible [`.seed`][amltk.optimization.Trial.seed] to use, - and an [`.info`][amltk.optimization.Trial.info] object, which is the optimizer - specific information, if required by you. - - !!! tip "Reporting success (or failure)" - - When using the [`success()`][amltk.optimization.trial.Trial.success] - method, make sure to provide values for all metrics specified in the - [`.metrics`][amltk.optimization.Trial.metrics] attribute. - Usually these are set by the optimizer generating the `Trial`. - - If you instead report using [`fail()`][amltk.optimization.trial.Trial.success], - any metric not specified will be set to the - [`.worst`][amltk.optimization.Metric.worst] value of the metric. - - Each metric has a unique name, and it's crucial to use the correct names when - reporting success, otherwise an error will occur. - - ??? example "Reporting success for metrics" - - For example: - - ```python exec="true" result="python" source="material-block" - from amltk.optimization import Trial, Metric - - # Gotten from some optimizer usually, i.e. via `optimizer.ask()` - trial = Trial.create( - name="example_trial", - config={"param": 42}, - metrics=[Metric(name="accuracy", minimize=False)] - ) - - # Incorrect usage (will raise an error) - try: - report = trial.success(invalid_metric=0.95) - except ValueError as error: - print(error) - - # Correct usage - report = trial.success(accuracy=0.95) - trial.bucket.rmdir() # markdown-exec: hide - ``` - - If using [`Plugins`][amltk.scheduling.plugins.Plugin], they may insert - some extra objects in the [`.extra`][amltk.optimization.Trial.extras] dict. - - To profile your trial, you can wrap the logic you'd like to check with - [`trial.profile()`][amltk.optimization.Trial.profile], which will automatically - profile the block of code for memory before and after as well as time taken. - - If you've [`profile()`][amltk.optimization.Trial.profile]'ed any intervals, - you can access them by name through - [`trial.profiles`][amltk.optimization.Trial.profiles]. - Please see the [`Profiler`][amltk.profiling.profiler.Profiler] - for more. - - ??? example "Profiling with a trial." - - ```python exec="true" source="material-block" result="python" title="profile" - from amltk.optimization import Trial - - trial = Trial.create(name="some-unique-name", config={}) - - # ... somewhere where you've begun your trial. 
- with trial.profile("some_interval"): - for work in range(100): - pass - - print(trial.profiler.df()) - trial.bucket.rmdir() # markdown-exec: hide - ``` - - You can also record anything you'd like into the - [`.summary`][amltk.optimization.Trial.summary], a plain `#!python dict` - or use [`trial.store()`][amltk.optimization.Trial.store] to store artifacts - related to the trial. - - ??? tip "What to put in `.summary`?" - - For large items, e.g. predictions or models, these are highly advised to - [`.store()`][amltk.optimization.Trial.store] to disk, especially if using - a `Task` for multiprocessing. - - Further, if serializing the report using the - [`report.df()`][amltk.optimization.Trial.Report.df], - returning a single row, - or a [`History`][amltk.optimization.History] - with [`history.df()`][amltk.optimization.History.df] for a dataframe consisting - of many of the reports, then you'd likely only want to store things - that are scalar and can be serialised to disk by a pandas DataFrame. - - """ + """The trial class.""" name: str """The unique name of the trial.""" @@ -772,45 +595,7 @@ def __rich__(self) -> Text: @dataclass class Report(RichRenderable, Generic[I2]): - """The [`Trial.Report`][amltk.optimization.Trial.Report] encapsulates - a [`Trial`][amltk.optimization.Trial], its status and any metrics/exceptions - that may have occured. - - Typically you will not create these yourself, but instead use - [`trial.success()`][amltk.optimization.Trial.success] or - [`trial.fail()`][amltk.optimization.Trial.fail] to generate them. - - ```python exec="true" source="material-block" result="python" - from amltk.optimization import Trial, Metric - - loss = Metric("loss", minimize=True) - - trial = Trial.create(name="trial", config={"x": 1}, metrics=[loss]) - - with trial.profile("fitting"): - # Do some work - # ... - report = trial.success(loss=1) - - print(report.df()) - trial.bucket.rmdir() # markdown-exec: hide - ``` - - These reports are used to report back metrics to an - [`Optimizer`][amltk.optimization.Optimizer] - with [`Optimizer.tell()`][amltk.optimization.Optimizer.tell] but can also be - stored for your own uses. - - You can access the original trial with the - [`.trial`][amltk.optimization.Trial.Report.trial] attribute, and the - [`Status`][amltk.optimization.Trial.Status] of the trial with the - [`.status`][amltk.optimization.Trial.Report.status] attribute. - - You may also want to check out the [`History`][amltk.optimization.History] class - for storing a collection of `Report`s, allowing for an easier time to convert - them to a dataframe or perform some common Hyperparameter optimization parsing - of metrics. - """ + """The report generated from a `Trial`.""" trial: Trial[I2] """The trial that was run.""" diff --git a/src/amltk/profiling/profiler.py b/src/amltk/profiling/profiler.py index 95b6af0e..c2cf0a77 100644 --- a/src/amltk/profiling/profiler.py +++ b/src/amltk/profiling/profiler.py @@ -1,80 +1,4 @@ -"""Whether for debugging, building an AutoML system or for optimization -purposes, we provide a powerful [`Profiler`][amltk.profiling.Profiler], -which can generate a [`Profile`][amltk.profiling.Profile] of different sections -of code. This is particularly useful with [`Trial`][amltk.optimization.Trial]s, -so much so that we attach one to every `Trial` made as -[`trial.profiler`][amltk.optimization.Trial.profiler]. - -When done profiling, you can export all generated profiles as a dataframe using -[`profiler.df()`][amltk.profiling.Profiler.df]. 
- -```python exec="true" result="python" source="material-block" -from amltk.profiling import Profiler -import numpy as np - -profiler = Profiler() - -with profiler("loading-data"): - X = np.random.rand(1000, 1000) - -with profiler("training-model"): - model = np.linalg.inv(X) - -with profiler("predicting"): - y = model @ X - -print(profiler.df()) -``` - -You'll find these profiles as keys in the [`Profiler`][amltk.profiling.Profiler], -e.g. `#! python profiler["loading-data"]`. - -This will measure both the time it took within the block but also -the memory consumed before and after the block finishes, allowing -you to get an estimate of the memory consumed. - - -??? tip "Memory, vms vs rms" - - While not entirely accurate, this should be enough for info - for most use cases. - - Given the main process uses 2GB of memory and the process - then spawns a new process in which you are profiling, as you - might do from a [`Task`][amltk.scheduling.Task]. In this new - process you use another 2GB on top of that, then: - - * The virtual memory size (**vms**) will show 4GB as the - new process will share the 2GB with the main process and - have it's own 2GB. - - * The resident set size (**rss**) will show 2GB as the - new process will only have 2GB of it's own memory. - - -If you need to profile some iterator, like a for loop, you can use -[`Profiler.each()`][amltk.profiling.Profiler.each] which will measure -the entire loop but also each individual iteration. This can be useful -for iterating batches of a deep-learning model, splits of a cross-validator -or really any loop with work you want to profile. - -```python exec="true" result="python" source="material-block" -from amltk.profiling import Profiler -import numpy as np - -profiler = Profiler() - -for i in profiler.each(range(3), name="for-loop"): - X = np.random.rand(1000, 1000) - -print(profiler.df()) -``` - -Lastly, to disable profiling without editing much code, -you can always use [`Profiler.disable()`][amltk.profiling.Profiler.disable] -and [`Profiler.enable()`][amltk.profiling.Profiler.enable] to toggle -profiling on and off. -""" +"""The profiler module provides classes for profiling code.""" from __future__ import annotations diff --git a/src/amltk/scheduling/events.py b/src/amltk/scheduling/events.py index ed721b38..13876e65 100644 --- a/src/amltk/scheduling/events.py +++ b/src/amltk/scheduling/events.py @@ -1,242 +1,4 @@ -"""One of the primary ways to respond to `@events` emitted -with by a [`Task`][amltk.scheduling.Task] -the [`Scheduler`][amltk.scheduling.Scheduler] -is through use of a **callback**. - -The reason for this is to enable an easier time for API's to utilize -multiprocessing and remote compute from the `Scheduler`, without having -to burden users with knowing the details of how to use multiprocessing. - -A callback subscribes to some event using a decorator but can also be done in -a functional style if preferred. The below example is based on the -event [`@scheduler.on_start`][amltk.scheduling.Scheduler.on_start] but -the same applies to all events. 
- -=== "Decorators" - - ```python exec="true" source="material-block" html="true" - from amltk.scheduling import Scheduler - - scheduler = Scheduler.with_processes(1) - - @scheduler.on_start - def print_hello() -> None: - print("hello") - - scheduler.run() - from amltk._doc import doc_print; doc_print(print, scheduler, fontsize="small") # markdown-exec: hide - ``` - -=== "Functional" - - ```python exec="true" source="material-block" html="true" - from amltk.scheduling import Scheduler - - scheduler = Scheduler.with_processes(1) - - def print_hello() -> None: - print("hello") - - scheduler.on_start(print_hello) - scheduler.run() - from amltk._doc import doc_print; doc_print(print, scheduler, fontsize="small") # markdown-exec: hide - ``` - -There are a number of ways to customize the behaviour of these callbacks, notably -to control how often they get called and when they get called. - -??? tip "Callback customization" - - - === "`on('event', repeat=...)`" - - This will cause the callback to be called `repeat` times successively. - This is most useful in combination with - [`@scheduler.on_start`][amltk.scheduling.Scheduler.on_start] to launch - a number of tasks at the start of the scheduler. - - ```python exec="true" source="material-block" html="true" hl_lines="11" - from amltk import Scheduler - - N_WORKERS = 2 - - def f(x: int) -> int: - return x * 2 - from amltk._doc import make_picklable; make_picklable(f) # markdown-exec: hide - - scheduler = Scheduler.with_processes(N_WORKERS) - task = scheduler.task(f) - - @scheduler.on_start(repeat=N_WORKERS) - def on_start(): - task.submit(1) - - scheduler.run() - from amltk._doc import doc_print; doc_print(print, scheduler, fontsize="small") # markdown-exec: hide - ``` - - === "`on('event', max_calls=...)`" - - Limit the number of times a callback can be called, after which, the callback - will be ignored. - - ```python exec="true" source="material-block" html="True" hl_lines="13" - from asyncio import Future - from amltk.scheduling import Scheduler - - scheduler = Scheduler.with_processes(2) - - def expensive_function(x: int) -> int: - return x ** 2 - from amltk._doc import make_picklable; make_picklable(expensive_function) # markdown-exec: hide - - @scheduler.on_start - def submit_calculations() -> None: - scheduler.submit(expensive_function, 2) - - @scheduler.on_future_result(max_calls=3) - def print_result(future, result) -> None: - scheduler.submit(expensive_function, 2) - - scheduler.run() - from amltk._doc import doc_print; doc_print(print, scheduler, output="html", fontsize="small") # markdown-exec: hide - ``` - - === "`on('event', when=...)`" - - A callable which takes no arguments and returns a `bool`. The callback - will only be called when the `when` callable returns `True`. - - Below is a rather contrived example, but it shows how we can use the - `when` parameter to control when the callback is called. 
- - ```python exec="true" source="material-block" html="True" hl_lines="8 12" - import random - from amltk.scheduling import Scheduler - - LOCALE = random.choice(["English", "German"]) - - scheduler = Scheduler.with_processes(1) - - @scheduler.on_start(when=lambda: LOCALE == "English") - def print_hello() -> None: - print("hello") - - @scheduler.on_start(when=lambda: LOCALE == "German") - def print_guten_tag() -> None: - print("guten tag") - - scheduler.run() - from amltk._doc import doc_print; doc_print(print, scheduler, output="html", fontsize="small") # markdown-exec: hide - ``` - - === "`on('event', every=...)`" - - Only call the callback every `every` times the event is emitted. This - includes the first time it's called. - - ```python exec="true" source="material-block" html="True" hl_lines="6" - from amltk.scheduling import Scheduler - - scheduler = Scheduler.with_processes(1) - - # Print "hello" only every 2 times the scheduler starts. - @scheduler.on_start(every=2) - def print_hello() -> None: - print("hello") - - # Run the scheduler 5 times - scheduler.run() - scheduler.run() - scheduler.run() - scheduler.run() - scheduler.run() - from amltk._doc import doc_print; doc_print(print, scheduler, output="html", fontsize="small") # markdown-exec: hide - ``` - -### Emitter, Subscribers and Events -This part of the documentation is not necessary to understand or use for AMLTK. People -wishing to build tools upon AMLTK may still find this a useful component to add to their -arsenal. - -The core of making this functionality work is the [`Emitter`][amltk.scheduling.events.Emitter]. -Its purpose is to have `@events` that can be emitted and subscribed to. Classes like the -[`Scheduler`][amltk.scheduling.Scheduler] and [`Task`][amltk.scheduling.Task] carry -around with them an `Emitter` to enable all of this functionality. - -Creating an `Emitter` is rather straight-forward, but we must also create -[`Events`][amltk.scheduling.events.Event] that people can subscribe to. - -```python -from amltk.scheduling import Emitter, Event -emitter = Emitter("my-emitter") - -event: Event[int] = Event("my-event") # (1)! - -@emitter.on(event) -def my_callback(x: int) -> None: - print(f"Got {x}!") - -emitter.emit(event, 42) # (2)! -``` - -1. The typing `#!python Event[int]` is used to indicate that the event will be emitting - an integer. This is not necessary, but it is useful for type-checking and - documentation. -2. The `#!python emitter.emit(event, 42)` is used to emit the event. This will call - all the callbacks registered for the event, i.e. `#!python my_callback()`. - -!!! warning "Independent Events" - - Given a single `Emitter` and a single instance of an `Event`, there is no way to - have different `@events` for callbacks. There are two options, both used extensively - in AMLTK. - - The first is to have different `Events` quite naturally, i.e. you distinguish - between different things that can happen. However, you often want to have different - objects emit the same `Event` but have different callbacks for each object. - - This makes most sense in the context of a `Task` the `Event` instances are shared as - class variables in the `Task` class, however a user likely want's to subscribe to - the `Event` for a specific instance of the `Task`. - - This is where the second option comes in, in which each object carries around its - own `Emitter` instance. This is how a user can subscribe to the same kind of `Event` - but individually for each `Task`. 
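As a rough sketch of that second option (the `MyTask` class and its `done` event are hypothetical, used only to illustrate the pattern), the `Event` stays a shared class attribute while every instance creates its own `Emitter`:

```python
from amltk.scheduling import Emitter, Event

class MyTask:

    # The Event is shared by all instances as a class variable ...
    done: Event[int] = Event("task-done")

    def __init__(self, name: str) -> None:
        # ... but each instance carries its own Emitter, so callbacks
        # are registered per-instance rather than globally.
        self.emitter = Emitter(f"emitter-{name}")

    def finish(self, result: int) -> None:
        self.emitter.emit(self.done, result)

task_a = MyTask("a")
task_b = MyTask("b")

@task_a.emitter.on(MyTask.done)
def print_result(result: int) -> None:
    print(f"task_a finished with {result}")

task_a.finish(1)  # Triggers print_result
task_b.finish(2)  # Nothing registered on task_b's emitter, so nothing happens
```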
- - -However, to shield users from this and to create named access points for users to -subscribe to, we can use the [`Subscriber`][amltk.scheduling.events.Subscriber] class, -conveniently created by the [`Emitter.subscriber()`][amltk.scheduling.events.Emitter.subscriber] -method. - -```python -from amltk.scheduling import Emitter, Event -emitter = Emitter("my-emitter") - -class GPT: - - event: Event[str] = Event("my-event") - - def __init__(self) -> None: - self.on_answer: Subscriber[str] = emitter.subscriber(self.event) - - def ask(self, question: str) -> None: - emitter.emit(self.event, "hello world!") - -gpt = GPT() - -@gpt.on_answer -def print_answer(answer: str) -> None: - print(answer) - -gpt.ask("What is the conical way for an AI to greet someone?") -``` - -Typically these event based systems make little sense in a synchronous context, however -with the [`Scheduler`][amltk.scheduling.Scheduler] and [`Task`][amltk.scheduling.Task] -classes, they are used to enable a simple way to use multiprocessing and remote compute. -""" # noqa: E501 +"""THe event system in AMLTK.""" from __future__ import annotations import logging diff --git a/src/amltk/scheduling/plugins/comm.py b/src/amltk/scheduling/plugins/comm.py index 7453a7f3..b1ad73a3 100644 --- a/src/amltk/scheduling/plugins/comm.py +++ b/src/amltk/scheduling/plugins/comm.py @@ -315,7 +315,7 @@ def on_start(): task.submit() @task.on("comm-close") - def on_close(msg: Comm.msg): + def on_close(msg: Comm.Msg): print(f"Worker close with {msg}") scheduler.run() diff --git a/src/amltk/scheduling/plugins/limiter.py b/src/amltk/scheduling/plugins/limiter.py index c158ad34..4c7a3085 100644 --- a/src/amltk/scheduling/plugins/limiter.py +++ b/src/amltk/scheduling/plugins/limiter.py @@ -8,7 +8,7 @@ ??? tip "Usage" ```python exec="true" source="material-block" html="true" - from amltk.scheduling import Scheduler + from amltk.scheduling import Scheduler, Task from amltk.scheduling.plugins import Limiter def fn(x: int) -> int: @@ -77,7 +77,7 @@ class Limiter(Plugin): followed by the arguments and keyword arguments that were passed to the task. ```python exec="true" source="material-block" html="true" - from amltk.scheduling import Scheduler + from amltk.scheduling import Scheduler, Task from amltk.scheduling.plugins import Limiter def fn(x: int) -> int: @@ -101,7 +101,7 @@ def callback(task: Task, *args, **kwargs): arguments and keyword arguments that were passed to the task. ```python exec="true" source="material-block" html="true" - from amltk.scheduling import Scheduler + from amltk.scheduling import Scheduler, Task from amltk.scheduling.plugins import Limiter def fn(x: int) -> int: @@ -128,7 +128,7 @@ def callback(task: Task, *args, **kwargs): the arguments and keyword arguments that were passed to the task. ```python exec="true" source="material-block" html="true" - from amltk.scheduling import Scheduler + from amltk.scheduling import Scheduler, Task from amltk.scheduling.plugins import Limiter def fn(x: int) -> int: diff --git a/src/amltk/scheduling/queue_monitor.py b/src/amltk/scheduling/queue_monitor.py index f7522b7a..d8b09df9 100644 --- a/src/amltk/scheduling/queue_monitor.py +++ b/src/amltk/scheduling/queue_monitor.py @@ -1,68 +1,4 @@ -"""A [`QueueMonitor`][amltk.scheduling.queue_monitor.QueueMonitor] is a -monitor for the scheduler queue. - -This module contains a monitor for the scheduler queue. The monitor tracks the -queue state at every event emitted by the scheduler. 
The data can be converted -to a pandas DataFrame or plotted as a stacked barchart. - -!!! note "Monitoring Frequency" - - To prevent repeated polling, we sample the scheduler queue at every scheduler event. - This is because the queue is only modified upon one of these events. This means we - don't need to poll the queue at a fixed interval. However, if you need more fine - grained updates, you can add extra events/timings at which the monitor should - [`update()`][amltk.scheduling.queue_monitor.QueueMonitor.update]. - -!!! warning "Performance impact" - - If your tasks and callbacks are very fast (~sub 10ms), then the monitor has a - non-nelgible impact however for most use cases, this should not be a problem. - As anything, you should profile how much work the scheduler can get done, - with and without the monitor, to see if it is a problem for your use case. - -In the below example, we have a very fast running function that runs on repeat, -sometimes too fast for the scheduler to keep up, letting some futures buildup needing -to be processed. - -```python exec="true" source="material-block" result="python" session="queue-monitor" -import time -import matplotlib.pyplot as plt -from amltk.scheduling import Scheduler -from amltk.scheduling.queue_monitor import QueueMonitor - -def fast_function(x: int) -> int: - return x + 1 -from amltk._doc import make_picklable; make_picklable(fast_function) # markdown-exec: hide - -N_WORKERS = 2 -scheduler = Scheduler.with_processes(N_WORKERS) -monitor = QueueMonitor(scheduler) -task = scheduler.task(fast_function) - -@scheduler.on_start(repeat=N_WORKERS) -def start(): - task.submit(1) - -@task.on_result -def result(_, x: int): - if scheduler.running(): - task.submit(x) - -scheduler.run(timeout=1) -df = monitor.df() -print(df) -``` - -We can also [`plot()`][amltk.scheduling.queue_monitor.QueueMonitor.plot] the data as a -stacked barchart with a set interval. - -```python exec="true" source="material-block" html="true" session="queue-monitor" -fig, ax = plt.subplots() -monitor.plot(interval=(50, "ms")) -from io import StringIO; fig.tight_layout(); buffer = StringIO(); plt.savefig(buffer, format="svg"); print(buffer.getvalue()) # markdown-exec: hide -``` - -""" # noqa: E501 +"""The queue monitoring.""" from __future__ import annotations import time diff --git a/src/amltk/scheduling/scheduler.py b/src/amltk/scheduling/scheduler.py index 845f6e88..f684c4d9 100644 --- a/src/amltk/scheduling/scheduler.py +++ b/src/amltk/scheduling/scheduler.py @@ -1,253 +1,4 @@ -"""The [`Scheduler`][amltk.scheduling.Scheduler] uses -an [`Executor`][concurrent.futures.Executor], a builtin python native with -a `#!python submit(f, *args, **kwargs)` function to submit compute to -be compute else where, whether it be locally or remotely. - -The `Scheduler` is primarily used to dispatch compute to an `Executor` and -emit `@events`, which can trigger user callbacks. - -Typically you should not use the `Scheduler` directly for dispatching and -responding to computed functions, but rather use a [`Task`][amltk.scheduling.Task] - -??? note "Running in a Jupyter Notebook/Colab" - - If you are using a Jupyter Notebook, you likley need to use the following - at the top of your notebook: - - ```python - import nest_asyncio # Only necessary in Notebooks - nest_asyncio.apply() - - scheduler.run(...) - ``` - - This is due to the fact a notebook runs in an async context. If you do not - wish to use the above snippet, you can instead use: - - ```python - await scheduler.async_run(...) 
- ``` - -??? tip "Basic Usage" - - In this example, we create a scheduler that uses local processes as - workers. We then create a task that will run a function `fn` and submit it - to the scheduler. Lastly, a callback is registered to `@future-result` to print the - result when the compute is done. - - ```python exec="true" source="material-block" html="true" - from amltk.scheduling import Scheduler - - def fn(x: int) -> int: - return x + 1 - from amltk._doc import make_picklable; make_picklable(fn) # markdown-exec: hide - - scheduler = Scheduler.with_processes(1) - - @scheduler.on_start - def launch_the_compute(): - scheduler.submit(fn, 1) - - @scheduler.on_future_result - def callback(future, result): - print(f"Result: {result}") - - scheduler.run() - from amltk._doc import doc_print; doc_print(print, scheduler) # markdown-exec: hide - ``` - - The last line in the previous example called - [`scheduler.run()`][amltk.scheduling.Scheduler.run] is what starts the scheduler - running, in which it will first emit the `@start` event. This triggered the - callback `launch_the_compute()` which submitted the function `fn` with the - arguments `#!python 1`. - - The scheduler then ran the compute and waited for it to complete, emitting the - `@future-result` event when it was done successfully. This triggered the callback - `callback()` which printed the result. - - At this point, there is no more compute happening and no more events to respond to - so the scheduler will halt. - -??? example "`@events`" - - === "Scheduler Status Events" - - When the scheduler enters some important state, it will emit an event - to let you know. - - === "`@start`" - - ::: amltk.scheduling.Scheduler.on_start - - === "`@finishing`" - - ::: amltk.scheduling.Scheduler.on_finishing - - === "`@finished`" - - ::: amltk.scheduling.Scheduler.on_finished - - === "`@stop`" - - ::: amltk.scheduling.Scheduler.on_stop - - === "`@timeout`" - - ::: amltk.scheduling.Scheduler.on_timeout - - === "`@empty`" - - ::: amltk.scheduling.Scheduler.on_empty - - === "Submitted Compute Events" - - When any compute goes through the `Scheduler`, it will emit an event - to let you know. You should however prefer to use a - [`Task`][amltk.scheduling.Task] as it will emit specific events - for the task at hand, and not all compute. - - === "`@future-submitted`" - - ::: amltk.scheduling.Scheduler.on_future_submitted - - === "`@future-result`" - - ::: amltk.scheduling.Scheduler.on_future_result - - === "`@future-exception`" - - ::: amltk.scheduling.Scheduler.on_future_exception - - === "`@future-done`" - - ::: amltk.scheduling.Scheduler.on_future_done - - === "`@future-cancelled`" - - ::: amltk.scheduling.Scheduler.on_future_cancelled - - -??? tip "Common usages of `run()`" - - There are various ways to [`run()`][amltk.scheduling.Scheduler.run] the - scheduler, notably how long it should run with `timeout=` and also how - it should react to any exception that may have occurred within the `Scheduler` - itself or your callbacks. - - Please see the [`run()`][amltk.scheduling.Scheduler.run] API doc for more - details and features, however we show two common use cases of using the `timeout=` - parameter. - - You can render a live display using [`run(display=...)`][amltk.scheduling.Scheduler.run]. - This require [`rich`](https://github.com/Textualize/rich) to be installed. You - can install this with `#!bash pip install rich` or `#!bash pip install amltk[rich]`. 
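    For instance, a minimal sketch of enabling the display (assuming `rich` is
    installed and that a plain boolean can be passed to `display=`) could look like:

    ```python
    from amltk.scheduling import Scheduler

    scheduler = Scheduler.with_processes(1)

    # Render a live view of the scheduler while it runs,
    # stopping after the one second timeout.
    scheduler.run(timeout=1, display=True)
    ```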
- - - === "`run(timeout=...)`" - - You can tell the `Scheduler` to stop after a certain amount of time - with the `timeout=` argument to [`run()`][amltk.scheduling.Scheduler.run]. - - This will also trigger the `@timeout` event as seen in the `Scheduler` output. - - ```python exec="true" source="material-block" html="True" hl_lines="19" - import time - from amltk.scheduling import Scheduler - - scheduler = Scheduler.with_processes(1) - - def expensive_function() -> int: - time.sleep(0.1) - return 42 - from amltk._doc import make_picklable; make_picklable(expensive_function) # markdown-exec: hide - - @scheduler.on_start - def submit_calculations() -> None: - scheduler.submit(expensive_function) - - # The will endlessly loop the scheduler - @scheduler.on_future_done - def submit_again(future: Future) -> None: - if scheduler.running(): - scheduler.submit(expensive_function) - - scheduler.run(timeout=1) # End after 1 second - from amltk._doc import doc_print; doc_print(print, scheduler, output="html", fontsize="small") # markdown-exec: hide - ``` - - === "`run(timeout=..., wait=False)`" - - By specifying that the `Scheduler` should not wait for ongoing tasks - to finish, the `Scheduler` will attempt to cancel and possibly terminate - any running tasks. - - ```python exec="true" source="material-block" html="True" - import time - from amltk.scheduling import Scheduler - - scheduler = Scheduler.with_processes(1) - - def expensive_function() -> None: - time.sleep(10) - - from amltk._doc import make_picklable; make_picklable(expensive_function) # markdown-exec: hide - - @scheduler.on_start - def submit_calculations() -> None: - scheduler.submit(expensive_function) - - scheduler.run(timeout=1, wait=False) # End after 1 second - from amltk._doc import doc_print; doc_print(print, scheduler, output="html", fontsize="small") # markdown-exec: hide - ``` - - ??? info "Forcibly Terminating Workers" - - As an `Executor` does not provide an interface to forcibly - terminate workers, we provide `Scheduler(terminate=...)` as a custom - strategy for cleaning up a provided executor. It is not possible - to terminate running thread based workers, for example using - `ThreadPoolExecutor` and any Executor using threads to spawn - tasks will have to wait until all running tasks are finish - before python can close. - - It's likely `terminate` will trigger the `EXCEPTION` event for - any tasks that are running during the shutdown, **not*** - a cancelled event. This is because we use a - [`Future`][concurrent.futures.Future] - under the hood and these can not be cancelled once running. - However there is no guarantee of this and is up to how the - `Executor` handles this. - -??? example "Scheduling something to be run later" - - You can schedule some function to be run later using the - [`#!python scheduler.call_later()`][amltk.scheduling.Scheduler.call_later] method. - - !!! note - - This does not run the function in the background, it just schedules some - function to be called later, where you could perhaps then use submit to - scheduler a [`Task`][amltk.scheduling.Task] to run the function in the - background. 
- - ```python exec="true" source="material-block" result="python" - from amltk.scheduling import Scheduler - - scheduler = Scheduler.with_processes(1) - - def fn() -> int: - print("Ending now!") - scheduler.stop() - - @scheduler.on_start - def schedule_fn() -> None: - scheduler.call_later(1, fn) - - scheduler.run(end_on_empty=False) - ``` - -""" # noqa: E501 +"""The scheduler for AMLTK.""" from __future__ import annotations import asyncio diff --git a/src/amltk/scheduling/task.py b/src/amltk/scheduling/task.py index f9f216fd..2e8cc7c4 100644 --- a/src/amltk/scheduling/task.py +++ b/src/amltk/scheduling/task.py @@ -1,75 +1,4 @@ -"""A [`Task`][amltk.scheduling.task.Task] is a unit of work that can be scheduled by the -[`Scheduler`][amltk.scheduling.Scheduler]. - -It is defined by its `function=` to call. Whenever a `Task` -has its [`submit()`][amltk.scheduling.task.Task.submit] method called, -the function will be dispatched to run by a `Scheduler`. - -When a task has returned, either successfully, or with an exception, -it will emit `@events` to indicate so. You can subscribe to these events -with callbacks and act accordingly. - - -??? example "`@events`" - - Check out the `@events` reference - for more on how to customize these callbacks. You can also take a look - at the API of [`on()`][amltk.scheduling.task.Task.on] for more information. - - === "`@on-result`" - - ::: amltk.scheduling.task.Task.on_result - - === "`@on-exception`" - - ::: amltk.scheduling.task.Task.on_exception - - === "`@on-done`" - - ::: amltk.scheduling.task.Task.on_done - - === "`@on-submitted`" - - ::: amltk.scheduling.task.Task.on_submitted - - === "`@on-cancelled`" - - ::: amltk.scheduling.task.Task.on_cancelled - -??? tip "Usage" - - The usual way to create a task is with - [`Scheduler.task()`][amltk.scheduling.scheduler.Scheduler.task], - where you provide the `function=` to call. - - ```python exec="true" source="material-block" html="true" - from amltk import Scheduler - - def f(x: int) -> int: - return x * 2 - from amltk._doc import make_picklable; make_picklable(f) # markdown-exec: hide - - scheduler = Scheduler.with_processes(2) - task = scheduler.task(f) - - @scheduler.on_start - def on_start(): - task.submit(1) - - @task.on_result - def on_result(future: Future[int], result: int): - print(f"Task {future} returned {result}") - - scheduler.run() - from amltk._doc import doc_print; doc_print(print, scheduler) # markdown-exec: hide - ``` - - If you'd like to simply just call the original function, without submitting it to - the scheduler, you can always just call the task directly, i.e. `#!python task(1)`. - -You can also provide [`Plugins`][amltk.scheduling.plugins.Plugin] to the task, -to modify tasks, add functionality and add new events. -""" +"""The task module.""" from __future__ import annotations import logging diff --git a/src/amltk/sklearn/evaluation.py b/src/amltk/sklearn/evaluation.py index f31d3db5..8d9d3bd3 100644 --- a/src/amltk/sklearn/evaluation.py +++ b/src/amltk/sklearn/evaluation.py @@ -1775,7 +1775,7 @@ def cv_early_stopping_plugin( from amltk.sklearn import CVEvaluation from amltk.pipeline import Component - from amltk.optimization import Metric + from amltk.optimization import Metric, Trial working_dir = Path("./some-path") pipeline = Component(DecisionTreeClassifier, space={"max_depth": (1, 10)})