From cf718baed9cf45a4bf7fc32a51e744c9685d1f05 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 4 Dec 2024 07:49:34 +0100 Subject: [PATCH 01/32] Remove unnecessary examples --- examples/Backtesting/botorch_analytical.py | 121 -- .../Backtesting/botorch_analytical_dark.svg | 1344 ----------------- .../Backtesting/botorch_analytical_light.svg | 1344 ----------------- .../continuous_space_botorch_function.py | 94 -- .../continuous_space_custom_function.py | 85 -- 5 files changed, 2988 deletions(-) delete mode 100644 examples/Backtesting/botorch_analytical.py delete mode 100644 examples/Backtesting/botorch_analytical_dark.svg delete mode 100644 examples/Backtesting/botorch_analytical_light.svg delete mode 100644 examples/Searchspaces/continuous_space_botorch_function.py delete mode 100644 examples/Searchspaces/continuous_space_custom_function.py diff --git a/examples/Backtesting/botorch_analytical.py b/examples/Backtesting/botorch_analytical.py deleted file mode 100644 index d0874eef5..000000000 --- a/examples/Backtesting/botorch_analytical.py +++ /dev/null @@ -1,121 +0,0 @@ -## Simulation loop using a BoTorch test function - -# This example shows a simulation loop for a single target with a BoTorch test function as lookup. - -# This example assumes some basic familiarity with using BayBE and how to use BoTorch test -# functions in discrete searchspaces. -# We thus refer to -# 1. [`campaign`](./../Basics/campaign.md) for a basic example on how to use BayBE and -# 2. [`discrete_space`](./../Searchspaces/discrete_space.md) for details on using a -# BoTorch test function. 
- -### Imports - -import os - -import numpy as np -import seaborn as sns -from botorch.test_functions import Rastrigin - -from baybe import Campaign -from baybe.objectives import SingleTargetObjective -from baybe.parameters import NumericalDiscreteParameter -from baybe.recommenders import RandomRecommender -from baybe.searchspace import SearchSpace -from baybe.simulation import simulate_scenarios -from baybe.targets import NumericalTarget -from baybe.utils.botorch_wrapper import botorch_function_wrapper -from baybe.utils.plotting import create_example_plots - -### Parameters for a full simulation loop - -# For the full simulation, we need to define the number of Monte Carlo runs -# and the number of experiments to be conducted per run. - -SMOKE_TEST = "SMOKE_TEST" in os.environ - -N_MC_ITERATIONS = 2 if SMOKE_TEST else 30 -N_DOE_ITERATIONS = 2 if SMOKE_TEST else 15 -BATCH_SIZE = 1 if SMOKE_TEST else 3 -POINTS_PER_DIM = 10 - -### Defining the test function - -# See [`discrete_space`](./../Searchspaces/discrete_space.md) for details. - -DIMENSION = 4 -TestFunctionClass = Rastrigin - -if not hasattr(TestFunctionClass, "dim"): - TestFunction = TestFunctionClass(dim=DIMENSION) -else: - print( - f"\nYou choose a dimension of {DIMENSION} for the test function" - f"{TestFunctionClass}. However, this function can only be used in " - f"{TestFunctionClass().dim} dimension, so the provided dimension is replaced." 
- ) - TestFunction = TestFunctionClass() - DIMENSION = TestFunctionClass().dim - -BOUNDS = TestFunction.bounds -WRAPPED_FUNCTION = botorch_function_wrapper(test_function=TestFunction) - -### Creating the searchspace and the objective - -parameters = [ - NumericalDiscreteParameter( - name=f"x_{k+1}", - values=list( - np.linspace( - BOUNDS[0, k], - BOUNDS[1, k], - POINTS_PER_DIM, - ) - ), - tolerance=0.01, - ) - for k in range(DIMENSION) -] - -searchspace = SearchSpace.from_product(parameters=parameters) -objective = SingleTargetObjective(target=NumericalTarget(name="Target", mode="MIN")) - -### Constructing campaigns - -seq_greedy_EI_campaign = Campaign( - searchspace=searchspace, - objective=objective, -) -random_campaign = Campaign( - searchspace=searchspace, - recommender=RandomRecommender(), - objective=objective, -) - -### Performing the simulation loop - -# We use [simulate_scenarios](baybe.simulation.scenarios.simulate_scenarios) to simulate a full experiment. - -scenarios = { - "Sequential greedy EI": seq_greedy_EI_campaign, - "Random": random_campaign, -} -results = simulate_scenarios( - scenarios, - WRAPPED_FUNCTION, - batch_size=BATCH_SIZE, - n_doe_iterations=N_DOE_ITERATIONS, - n_mc_iterations=N_MC_ITERATIONS, -) - -# We use the plotting utility to create plots. 
- -ax = sns.lineplot( - data=results, - marker="o", - markersize=10, - x="Num_Experiments", - y="Target_CumBest", - hue="Scenario", -) -create_example_plots(ax=ax, base_name="botorch_analytical") diff --git a/examples/Backtesting/botorch_analytical_dark.svg b/examples/Backtesting/botorch_analytical_dark.svg deleted file mode 100644 index 76e38d76d..000000000 --- a/examples/Backtesting/botorch_analytical_dark.svg +++ /dev/null @@ -1,1344 +0,0 @@ - - - - - - - - 2024-08-02T18:50:09.281391 - image/svg+xml - - - Matplotlib v3.9.1, https://matplotlib.org/ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/examples/Backtesting/botorch_analytical_light.svg b/examples/Backtesting/botorch_analytical_light.svg deleted file mode 100644 index 9ba6a7739..000000000 --- a/examples/Backtesting/botorch_analytical_light.svg +++ /dev/null @@ -1,1344 +0,0 @@ - - - - - - - - 2024-08-02T18:50:09.298960 - image/svg+xml - - - Matplotlib v3.9.1, https://matplotlib.org/ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/examples/Searchspaces/continuous_space_botorch_function.py b/examples/Searchspaces/continuous_space_botorch_function.py deleted file mode 100644 index aa7bc7246..000000000 --- a/examples/Searchspaces/continuous_space_botorch_function.py +++ /dev/null @@ -1,94 +0,0 @@ -## Example for using a synthetic BoTorch test function in a continuous searchspace - -# Example for using the synthetic test functions in a continuous spaces. -# All test functions that are available in BoTorch are also available here and wrapped -# via the `botorch_function_wrapper`. - -# This example assumes some basic familiarity with using BayBE. -# We thus refer to [`campaign`](./../Basics/campaign.md) for a basic example. -# Also, there is a large overlap with other examples with regards to using the test function. -# We thus refer to [`discrete_space`](./discrete_space.md) for details on this aspect. 
- - -### Necessary imports for this example - -from botorch.test_functions import Rastrigin - -from baybe import Campaign -from baybe.objectives import SingleTargetObjective -from baybe.parameters import NumericalContinuousParameter -from baybe.searchspace import SearchSpace -from baybe.targets import NumericalTarget -from baybe.utils.botorch_wrapper import botorch_function_wrapper - -### Defining the test function - -# See [`discrete_space`](./../Searchspaces/discrete_space.md) for details. - -DIMENSION = 4 -TestFunctionClass = Rastrigin - -if not hasattr(TestFunctionClass, "dim"): - TestFunction = TestFunctionClass(dim=DIMENSION) -elif TestFunctionClass().dim == DIMENSION: - TestFunction = TestFunctionClass() -else: - print( - f"\nYou choose a dimension of {DIMENSION} for the test function" - f"{TestFunctionClass}. However, this function can only be used in " - f"{TestFunctionClass().dim} dimension, so the provided dimension is replaced. " - "Also, DISC_INDICES and CONT_INDICES will be re-written." - ) - TestFunction = TestFunctionClass() - DIMENSION = TestFunctionClass().dim - DISC_INDICES = list(range(0, (DIMENSION + 1) // 2)) - CONT_INDICES = list(range((DIMENSION + 1) // 2, DIMENSION)) - -BOUNDS = TestFunction.bounds -WRAPPED_FUNCTION = botorch_function_wrapper(test_function=TestFunction) - -### Creating the searchspace and the objective - -# Since the searchspace is continuous, we use `NumericalContinuousParameter`s. -# We use the data of the test function to deduce bounds and number of parameters. 
- -parameters = [ - NumericalContinuousParameter( - name=f"x_{k+1}", - bounds=(BOUNDS[0, k], BOUNDS[1, k]), - ) - for k in range(DIMENSION) -] - -searchspace = SearchSpace.from_product(parameters=parameters) -objective = SingleTargetObjective(target=NumericalTarget(name="Target", mode="MIN")) - -### Constructing the campaign and performing a recommendation - -campaign = Campaign( - searchspace=searchspace, - objective=objective, -) - -# Get a recommendation for a fixed batch size. - -BATCH_SIZE = 3 -recommendation = campaign.recommend(batch_size=BATCH_SIZE) - -# Evaluate the test function. -# Note that we need iterate through the rows of the recommendation. -# Furthermore, we need to interpret the row as a list. - -target_values = [] -for index, row in recommendation.iterrows(): - target_values.append(WRAPPED_FUNCTION(*row.to_list())) - -# We add an additional column with the calculated target values. - -recommendation["Target"] = target_values - -# Here, we inform the campaign about our measurement. - -campaign.add_measurements(recommendation) -print("\n\nRecommended experiments with measured values: ") -print(recommendation) diff --git a/examples/Searchspaces/continuous_space_custom_function.py b/examples/Searchspaces/continuous_space_custom_function.py deleted file mode 100644 index 963919af7..000000000 --- a/examples/Searchspaces/continuous_space_custom_function.py +++ /dev/null @@ -1,85 +0,0 @@ -## Example for using a custom BoTorch test function in a continuous searchspace - -# This example shows how an arbitrary python function can be used as lookup. - -# This example assumes some basic familiarity with using BayBE. -# We thus refer to [`campaign`](./../Basics/campaign.md) for a basic example. 
- -### Necessary imports - -from baybe import Campaign -from baybe.objectives import SingleTargetObjective -from baybe.parameters import NumericalContinuousParameter -from baybe.searchspace import SearchSpace -from baybe.targets import NumericalTarget - -### Defining the custom test function - -# The function should accept an arbitrary or fixed amount of floats as input. -# It needs to return either a single float or a tuple of floats. -# It is assumed that the analytical test function does only perform a single calculation. -# That is, it is assumed to work in a non-batched-way! - -# In this example, we implement a simple sum of squares function with a single output. - - -def sum_of_squares(*x: float) -> float: - """Calculate the sum of squares.""" - res = 0 - for y in x: - res += y**2 - return res - - -TEST_FUNCTION = sum_of_squares - -# For our actual experiment, we need to specify the number of dimension that we want to use. -# This is necessary to know for the creation of the parameters. -# Similarly, it is necessary to state the bounds of the parameters. -# These should be provided as a list of two-dimensional tuples. - -DIMENSION = 4 -BOUNDS = [(-2, 2), (-2, 2), (-2, 2), (-2, 2)] - -### Creating the searchspace and the objective - -parameters = [ - NumericalContinuousParameter( - name=f"x_{k+1}", - bounds=BOUNDS[k], - ) - for k in range(DIMENSION) -] - -searchspace = SearchSpace.from_product(parameters=parameters) - -objective = SingleTargetObjective(target=NumericalTarget(name="Target", mode="MIN")) - -### Constructing the campaign and performing a recommendation - -campaign = Campaign( - searchspace=searchspace, - objective=objective, -) - -# Get a recommendation for a fixed batch size. -BATCH_SIZE = 3 -recommendation = campaign.recommend(batch_size=BATCH_SIZE) - -# Evaluate the test function. -# Note that we need iterate through the rows of the recommendation. -# Furthermore, we need to interpret the row as a list. 
- -target_values = [] -for index, row in recommendation.iterrows(): - target_values.append(TEST_FUNCTION(*row.to_list())) - -# We add an additional column with the calculated target values. - -recommendation["Target"] = target_values - -# Here, we inform the campaign about our measurement. - -campaign.add_measurements(recommendation) -print("\n\nRecommended experiments with measured values: ") -print(recommendation) From fd4cf9dd19bacd2a803be90b9e382da3f3b5bf20 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 4 Dec 2024 11:28:29 +0100 Subject: [PATCH 02/32] Handle edge case of empty filter in dataframe utility --- baybe/utils/dataframe.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/baybe/utils/dataframe.py b/baybe/utils/dataframe.py index d1037b6d0..96e0e1dea 100644 --- a/baybe/utils/dataframe.py +++ b/baybe/utils/dataframe.py @@ -643,7 +643,24 @@ def filter_df( num cat 2 1 a 3 1 b + + >>> filter_df(df, pd.DataFrame(), complement=True) + num cat + 0 0 a + 1 0 b + 2 1 a + 3 1 b + + >>> filter_df(df, pd.DataFrame(), complement=False) + Empty DataFrame + Columns: [num, cat] + Index: [] + """ + # Handle special case of empty filter + if filter.empty: + return df if complement else pd.DataFrame(columns=df.columns) + # Remember original index name index_name = df.index.name From 6033d8b2131b4bd9ed1fdc9ff628d1b0080b0e90 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 4 Dec 2024 11:36:45 +0100 Subject: [PATCH 03/32] Refactor callable lookup mechanism The callable now operates with dataframe input/output, replacing the positional indexing approach with a clean label-based approach using parameter and target names. 
--- baybe/simulation/core.py | 4 +--- baybe/simulation/lookup.py | 33 ++++++--------------------------- baybe/simulation/scenarios.py | 4 ++-- 3 files changed, 9 insertions(+), 32 deletions(-) diff --git a/baybe/simulation/core.py b/baybe/simulation/core.py index 93033a1f5..b28c8473a 100644 --- a/baybe/simulation/core.py +++ b/baybe/simulation/core.py @@ -23,7 +23,7 @@ def simulate_experiment( campaign: Campaign, - lookup: pd.DataFrame | Callable | None = None, + lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None = None, /, *, batch_size: int = 1, @@ -85,8 +85,6 @@ def simulate_experiment( * for each target a column ``{targetname}_Measurements``: The individual measurements obtained for the respective target and iteration """ - # TODO: Due to the "..." operator, sphinx does not render this properly. Might - # want to investigate in the future. # TODO: Use a `will_terminate` campaign property to decide if the campaign will # run indefinitely or not, and allow omitting `n_doe_iterations` for the latter. 
if campaign.objective is None: diff --git a/baybe/simulation/lookup.py b/baybe/simulation/lookup.py index f96c68e62..d58c9aa05 100644 --- a/baybe/simulation/lookup.py +++ b/baybe/simulation/lookup.py @@ -19,7 +19,7 @@ def look_up_targets( queries: pd.DataFrame, targets: Collection[Target], - lookup: pd.DataFrame | Callable | None, + lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None, impute_mode: Literal[ "error", "worst", "best", "mean", "random", "ignore" ] = "error", @@ -70,7 +70,7 @@ def look_up_targets( if lookup is None: add_fake_measurements(queries, targets) elif isinstance(lookup, Callable): - _look_up_targets_from_callable(queries, targets, lookup) + _look_up_targets_from_callable(queries, lookup) elif isinstance(lookup, pd.DataFrame): _look_up_targets_from_dataframe(queries, targets, lookup, impute_mode) else: @@ -78,33 +78,12 @@ def look_up_targets( def _look_up_targets_from_callable( - queries: pd.DataFrame, - targets: Collection[Target], - lookup: Callable, + queries: pd.DataFrame, lookup: Callable[[pd.DataFrame], pd.DataFrame] ) -> None: """Look up target values by querying a callable.""" - # TODO: Currently, the alignment of return values to targets is based on the - # column ordering, which is not robust. Instead, the callable should return - # a dataframe with properly labeled columns. - - # Since the return of a lookup function is a tuple, the following code stores - # tuples of floats in a single column with label 0: - measured_targets = queries.apply(lambda x: lookup(*x.values), axis=1).to_frame() - # We transform this column to a DataFrame in which there is an individual - # column for each of the targets.... - split_target_columns = pd.DataFrame( - measured_targets[0].to_list(), index=measured_targets.index - ) - # ... 
and assign this to measured_targets in order to have one column per target - measured_targets[split_target_columns.columns] = split_target_columns - if measured_targets.shape[1] != len(targets): - raise AssertionError( - "If you use an analytical function as lookup, make sure " - "the configuration has the right amount of targets " - "specified." - ) - for k_target, target in enumerate(targets): - queries[target.name] = measured_targets.iloc[:, k_target] + df_targets = lookup(queries) + for col in df_targets: + queries[col] = df_targets[col] def _look_up_targets_from_dataframe( diff --git a/baybe/simulation/scenarios.py b/baybe/simulation/scenarios.py index 2243dfafd..f332e6909 100644 --- a/baybe/simulation/scenarios.py +++ b/baybe/simulation/scenarios.py @@ -22,7 +22,7 @@ def simulate_scenarios( scenarios: dict[Any, Campaign], - lookup: pd.DataFrame | Callable | None = None, + lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None = None, /, *, batch_size: int = 1, @@ -161,7 +161,7 @@ def unpack_simulation_results(array: DataArray) -> pd.DataFrame: def _simulate_groupby( campaign: Campaign, - lookup: pd.DataFrame | Callable[..., tuple[float, ...]] | None = None, + lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None = None, /, *, batch_size: int = 1, From 05e63eff62b3914da95aaedb30d9a1c9271594ca Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 4 Dec 2024 12:43:10 +0100 Subject: [PATCH 04/32] Implement decorator utility for creating lookup callables --- baybe/simulation/__init__.py | 2 ++ baybe/simulation/lookup.py | 41 +++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/baybe/simulation/__init__.py b/baybe/simulation/__init__.py index bfde8efc5..e27df1999 100644 --- a/baybe/simulation/__init__.py +++ b/baybe/simulation/__init__.py @@ -16,10 +16,12 @@ """ from baybe.simulation.core import simulate_experiment +from baybe.simulation.lookup import label_columns from 
baybe.simulation.scenarios import simulate_scenarios from baybe.simulation.transfer_learning import simulate_transfer_learning __all__ = [ + "label_columns", "simulate_experiment", "simulate_scenarios", "simulate_transfer_learning", diff --git a/baybe/simulation/lookup.py b/baybe/simulation/lookup.py index d58c9aa05..10aa2b0e6 100644 --- a/baybe/simulation/lookup.py +++ b/baybe/simulation/lookup.py @@ -2,8 +2,9 @@ from __future__ import annotations +import functools import logging -from collections.abc import Callable, Collection +from collections.abc import Callable, Collection, Sequence from typing import Literal import numpy as np @@ -138,3 +139,41 @@ def _look_up_targets_from_dataframe( # Add the lookup values queries.loc[:, target_names] = np.asarray(all_match_vals) + + +def label_columns( + input_labels: Sequence[str], output_labels: Sequence[str] +) -> Callable: + """Create a decorator for labeling the inputs and outputs of array-based callables. + + The decorator transforms a callable designed to work with unlabelled arrays such + that it can operate with dataframes instead. The original callable is expected to + accept and return two-dimensional arrays. When decorated, the callable accepts and + returns dataframes whose columns are mapped to the corresponding arrays based on the + specified label sequences. + + Args: + input_labels: The sequence of input labels mapping the columns of the input + dataframe to columns of the input array in the specified order. + output_labels: The sequence of output labels mapping the columns of the output + dataframe to columns of the output array in the specified order. + + Returns: + The decorator for the given input and output labels. 
+ """ + + def decorator( + fn: Callable[[np.ndarray], np.ndarray], + ) -> Callable[[pd.DataFrame], pd.DataFrame]: + """Turn an array-based callable into a dataframe-based callable.""" + + @functools.wraps(fn) + def wrapper(df: pd.DataFrame, /) -> pd.DataFrame: + """Translate to/from an array-based callable using dataframes.""" + array_in = df[input_labels].to_numpy() + array_out = fn(array_in) + return pd.DataFrame(array_out, columns=output_labels, index=df.index) + + return wrapper + + return decorator From c05a71ffa0ffbc85ddc3ce5a412adfccc5c0c596 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 4 Dec 2024 12:45:10 +0100 Subject: [PATCH 05/32] Refactor custom analytical backtesting example --- examples/Backtesting/custom_analytical.py | 136 +- .../Backtesting/custom_analytical_dark.svg | 1434 +++++++++++++++++ .../Backtesting/custom_analytical_light.svg | 1434 +++++++++++++++++ 3 files changed, 2929 insertions(+), 75 deletions(-) create mode 100644 examples/Backtesting/custom_analytical_dark.svg create mode 100644 examples/Backtesting/custom_analytical_light.svg diff --git a/examples/Backtesting/custom_analytical.py b/examples/Backtesting/custom_analytical.py index c07bff2e8..1bef726b3 100644 --- a/examples/Backtesting/custom_analytical.py +++ b/examples/Backtesting/custom_analytical.py @@ -1,124 +1,110 @@ -## Example for full simulation loop using a custom analytical test function +## Optimizing a Custom Black-Box Function -# This example shows a simulation loop for a single target with a custom test function as lookup. -# That is, we perform several Monte Carlo runs with several iterations. -# In addition, we also store and display the results. +# This example demonstrates how to optimize a custom black-box function: +# * We create a black-box callable and define the corresponding optimization scope, +# * set up optimization strategies, +# * and compare the resulting trajectories. 
-# This example assumes some basic familiarity with using BayBE and how to use BoTorch test -# functions in discrete searchspaces. -# For further details, we thus refer to -# - [`campaign`](./../Basics/campaign.md) for a basic example on how to use BayBE and -# - [here](./../Searchspaces/continuous_space_custom_function.md) for how to use a custom function. - -### Necessary imports for this example +### Imports import os -import matplotlib.pyplot as plt import numpy as np import seaborn as sns from baybe import Campaign -from baybe.objectives import SingleTargetObjective -from baybe.parameters import NumericalDiscreteParameter -from baybe.recommenders import ( - BotorchRecommender, - RandomRecommender, - TwoPhaseMetaRecommender, -) +from baybe.parameters.numerical import NumericalContinuousParameter +from baybe.recommenders import RandomRecommender from baybe.searchspace import SearchSpace -from baybe.simulation import simulate_scenarios +from baybe.simulation import label_columns, simulate_scenarios from baybe.targets import NumericalTarget +from baybe.utils.plotting import create_example_plots -### Parameters for a full simulation loop +### Settings -# For the full simulation, we need to define some additional parameters. -# These are the number of Monte Carlo runs and the number of experiments to be conducted per run. - -# The parameter `POINTS_PER_DIM` controls the number of points per dimension. -# Note that the searchspace will have `POINTS_PER_DIM**DIMENSION` many points. 
+# Before we start, let us collect a few general settings for the example: SMOKE_TEST = "SMOKE_TEST" in os.environ -N_MC_ITERATIONS = 2 if SMOKE_TEST else 5 -N_DOE_ITERATIONS = 2 if SMOKE_TEST else 5 -DIMENSION = 4 -BOUNDS = [(-2, 2), (-2, 2), (-2, 2), (-2, 2)] -POINTS_PER_DIM = 3 if SMOKE_TEST else 10 - -### Defining the test function +BATCH_SIZE = 1 +N_MC_ITERATIONS = 2 if SMOKE_TEST else 20 +N_DOE_ITERATIONS = 2 if SMOKE_TEST else 30 +DIMENSION = 1 if SMOKE_TEST else 10 +BOUNDS = (-1, 1) -# See [here](./../Searchspaces/continuous_space_custom_function.md) for details. +### Defining the Optimization Problem +# Now, we can define the scope of our optimization problem. Our goal is to optimize +# a high-dimensional quadratic function on a bounded input domain. We first define +# the corresponding inputs and output of the function: -def sum_of_squares(*x: float) -> float: - """Calculate the sum of squares.""" - res = 0 - for y in x: - res += y**2 - return res +parameters = [ + NumericalContinuousParameter(name=f"x_{k}", bounds=BOUNDS) for k in range(DIMENSION) +] +target = NumericalTarget(name="Target", mode="MIN") -### Creating the searchspace and the objective +# Based on the above, we construct the black-box callable to be optimized. +# Using the {func}`~baybe.simulation.lookup.label_columns` decorator, we can easily map +# the columns of the raw input/output arrays to our parameter and target objects, which +# creates the required dataframe-based lookup for the optimization loop: -# As we expect it to be the most common use case, we construct a purely discrete space here. -# Details on how to adjust this for other spaces can be found in the searchspace examples. 
-parameters = [ - NumericalDiscreteParameter( - name=f"x_{k+1}", - values=list(np.linspace(*BOUNDS[k], POINTS_PER_DIM)), - tolerance=0.01, - ) - for k in range(DIMENSION) -] +@label_columns([p.name for p in parameters], [target.name]) +def sum_of_squares(x: np.ndarray, /) -> np.ndarray: + """Calculate the sum of squares.""" + return (x**2).sum(axis=1, keepdims=True) -searchspace = SearchSpace.from_product(parameters=parameters) -objective = SingleTargetObjective(target=NumericalTarget(name="Target", mode="MIN")) -### Constructing campaigns for the simulation loop +# What remains is to construct the search space and objective for the optimization: -# To simplify adjusting the example for other recommenders, we construct some recommender objects. -# For details on recommender objects, we refer to [`recommenders`](./../Basics/recommenders.md). +searchspace = SearchSpace.from_product(parameters=parameters) +objective = target.to_objective() -seq_greedy_EI_recommender = TwoPhaseMetaRecommender( - recommender=BotorchRecommender(acquisition_function="qEI"), -) -random_recommender = TwoPhaseMetaRecommender(recommender=RandomRecommender()) +### Creating the Campaigns -# We now create one campaign per recommender. +# We consider two optimization scenarios, each represented by its own campaign: +# * Optimization using the default recommender +# * A baseline using randomly generated recommendations -seq_greedy_EI_campaign = Campaign( +default_campaign = Campaign( searchspace=searchspace, - recommender=seq_greedy_EI_recommender, objective=objective, ) random_campaign = Campaign( searchspace=searchspace, - recommender=random_recommender, objective=objective, + recommender=RandomRecommender(), ) -### Performing the simulation loop +### Running the Optimization Loop -# We can now use the `simulate_scenarios` function to simulate a full experiment. -# Note that this function enables to run multiple scenarios by a single function call. 
-# For this, it is necessary to define a dictionary mapping scenario names to campaigns. +# Next, we simulate both scenarios using the +# {func}`~baybe.simulation.scenarios.simulate_scenarios` utility, +# which automatically executes several Monte Carlo simulations for each campaign: scenarios = { - "Sequential greedy EI": seq_greedy_EI_campaign, - "Random": random_campaign, + "Default Recommender": default_campaign, + "Random Recommender": random_campaign, } results = simulate_scenarios( scenarios, sum_of_squares, - batch_size=3, + batch_size=BATCH_SIZE, n_doe_iterations=N_DOE_ITERATIONS, n_mc_iterations=N_MC_ITERATIONS, ) -# The following lines plot the results and save the plot in run_analytical.png +### Plotting the Results + +# Finally, we compare the trajectories of the campaigns: -sns.lineplot(data=results, x="Num_Experiments", y="Target_CumBest", hue="Scenario") -plt.gcf().set_size_inches(24, 8) -plt.savefig("./run_analytical.png") +ax = sns.lineplot( + data=results, + marker="o", + markersize=10, + x="Num_Experiments", + y="Target_CumBest", + hue="Scenario", +) +create_example_plots(ax=ax, base_name="custom_analytical") diff --git a/examples/Backtesting/custom_analytical_dark.svg b/examples/Backtesting/custom_analytical_dark.svg new file mode 100644 index 000000000..a2987dc03 --- /dev/null +++ b/examples/Backtesting/custom_analytical_dark.svg @@ -0,0 +1,1434 @@ + + + + + + + + 2024-12-04T12:51:05.503249 + image/svg+xml + + + Matplotlib v3.9.3, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/examples/Backtesting/custom_analytical_light.svg b/examples/Backtesting/custom_analytical_light.svg new file mode 100644 index 000000000..aa0fcb6fb --- /dev/null +++ b/examples/Backtesting/custom_analytical_light.svg @@ -0,0 +1,1434 @@ + + + + + + + + 2024-12-04T12:51:05.528476 + image/svg+xml + + + Matplotlib v3.9.3, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 
e100edad5a6bb7c79c20bd124a04a9ca034a26d9 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 4 Dec 2024 13:04:34 +0100 Subject: [PATCH 06/32] Update lookup mechanism in benchmark code --- benchmarks/domains/synthetic_2C1D_1C.py | 28 +++++++++++++++---------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/benchmarks/domains/synthetic_2C1D_1C.py b/benchmarks/domains/synthetic_2C1D_1C.py index abb8ab176..d60259e76 100644 --- a/benchmarks/domains/synthetic_2C1D_1C.py +++ b/benchmarks/domains/synthetic_2C1D_1C.py @@ -10,10 +10,11 @@ from baybe.campaign import Campaign from baybe.parameters import NumericalContinuousParameter, NumericalDiscreteParameter -from baybe.recommenders.pure.nonpredictive.sampling import RandomRecommender +from baybe.recommenders import RandomRecommender from baybe.searchspace import SearchSpace from baybe.simulation import simulate_scenarios -from baybe.targets import NumericalTarget, TargetMode +from baybe.simulation.lookup import label_columns +from baybe.targets import NumericalTarget from benchmarks.definition import ( Benchmark, ConvergenceExperimentSettings, @@ -23,8 +24,9 @@ from mpl_toolkits.mplot3d import Axes3D -def _lookup(z: np.ndarray, x: np.ndarray, y: np.ndarray) -> np.ndarray: +def _lookup(arr: np.ndarray, /) -> np.ndarray: """Lookup that is used internally in the callable for the benchmark.""" + x, y, z = np.array_split(arr, 3, axis=1) try: assert np.all(-2 * pi <= x) and np.all(x <= 2 * pi) assert np.all(-2 * pi <= y) and np.all(y <= 2 * pi) @@ -60,24 +62,26 @@ def synthetic_2C1D_1C(settings: ConvergenceExperimentSettings) -> DataFrame: NumericalDiscreteParameter("z", (1, 2, 3, 4)), ] - objective = NumericalTarget(name="target", mode=TargetMode.MAX).to_objective() - search_space = SearchSpace.from_product(parameters=parameters) + target = NumericalTarget(name="target", mode="MAX") + searchspace = SearchSpace.from_product(parameters=parameters) scenarios: dict[str, Campaign] = { "Random Recommender": Campaign( 
- searchspace=search_space, + searchspace=searchspace, recommender=RandomRecommender(), - objective=objective, + objective=target, ), "Default Recommender": Campaign( - searchspace=search_space, - objective=objective, + searchspace=searchspace, + objective=target, ), } + lookup = label_columns([p.name for p in parameters], [target.name])(_lookup) + return simulate_scenarios( scenarios, - _lookup, + lookup, batch_size=settings.batch_size, n_doe_iterations=settings.n_doe_iterations, n_mc_iterations=settings.n_mc_iterations, @@ -116,7 +120,9 @@ def synthetic_2C1D_1C(settings: ConvergenceExperimentSettings) -> DataFrame: fig = plt.figure(figsize=(10, 10)) for i, z in enumerate(Z): ax: Axes3D = fig.add_subplot(2, 2, i + 1, projection="3d") - t_mesh = _lookup(np.asarray(z), x_mesh, y_mesh) + t_mesh = _lookup( + np.c_[x_mesh.ravel(), y_mesh.ravel(), np.repeat(z, x_mesh.size)] + ).reshape(x_mesh.shape) ax.plot_surface(x_mesh, y_mesh, t_mesh) plt.title(f"{z=}") From 2f8534cc3e9d2b550059a4f663587154282e64a4 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 4 Dec 2024 13:08:26 +0100 Subject: [PATCH 07/32] Update CHANGELOG.md --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2969cf922..d7a06617c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,14 +5,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [Unreleased] +### Breaking Changes +- Lookup callables for simulation are now expected to accept/return dataframes with + the corresponding parameter/target column labels + ### Added - `allow_missing` and `allow_extra` keyword arguments to `Objective.transform` - Example for a traditional mixture - `add_noise_to_perturb_degenerate_rows` utility - `benchmarks` subpackage for defining and running performance tests – `Campaign.toggle_discrete_candidates` to dynamically in-/exclude discrete candidates +- `filter_df` utility for filtering dataframe content - `DiscreteConstraint.get_valid` to conveniently access valid candidates - Functionality for persisting benchmarking results on S3 from a manual pipeline run +- `label_columns` decorator utility for convenient creation of lookup callables ### Changed - `SubstanceParameter` encodings are now computed exclusively with the From bc585c118bb07f827c3f2abdfe6d8da260acd8be Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 4 Dec 2024 16:45:00 +0100 Subject: [PATCH 08/32] Drop multi-target.py example Will be replaced with a proper one showing the difference between single target, desirability, and Pareto optimization. --- examples/Backtesting/multi_target.py | 113 --------------------------- 1 file changed, 113 deletions(-) delete mode 100644 examples/Backtesting/multi_target.py diff --git a/examples/Backtesting/multi_target.py b/examples/Backtesting/multi_target.py deleted file mode 100644 index 88cdb89f4..000000000 --- a/examples/Backtesting/multi_target.py +++ /dev/null @@ -1,113 +0,0 @@ -## Example for full simulation loop using the multi target mode for custom analytic functions - -# This example shows how to use a multi target objective for a custom analytic function. -# It uses a desirability value to handle several targets. - -# This example assumes basic familiarity with BayBE, custom test functions and multiple targets. 
-# For further details, we thus refer to -# - [`campaign`](./../Basics/campaign.md) for a more general and basic example, -# - [`custom_analytical`](./custom_analytical.md) for custom test functions, and -# - [`desirability`](./../Multi_Target/desirability.md) for multiple targets. - -### Necessary imports for this example - -import os - -import numpy as np - -from baybe import Campaign -from baybe.objectives import DesirabilityObjective -from baybe.parameters import NumericalDiscreteParameter -from baybe.searchspace import SearchSpace -from baybe.simulation import simulate_scenarios -from baybe.targets import NumericalTarget - -### Parameters for a full simulation loop - -# For the full simulation, we need to define some additional parameters. -# These are the number of Monte Carlo runs and the number of experiments to be conducted per run. - -SMOKE_TEST = "SMOKE_TEST" in os.environ - -N_MC_ITERATIONS = 2 if SMOKE_TEST else 5 -N_DOE_ITERATIONS = 2 if SMOKE_TEST else 4 -BATCH_SIZE = 1 if SMOKE_TEST else 2 -DIMENSION = 4 -BOUNDS = [(-2, 2), (-2, 2), (-2, 2), (-2, 2)] -POINTS_PER_DIM = 3 if SMOKE_TEST else 10 - - -### Defining the test function - -# See [`custom_analytical`](./custom_analytical.md) for details. - - -def sum_of_squares(*x: float) -> tuple[float, float]: - """Calculate the sum of squares.""" - res = 0 - for y in x: - res += y**2 - return res, 2 * res**2 - 1 - - -### Creating the searchspace - -# In this example, we construct a purely discrete space with 10 points per dimension. - -parameters = [ - NumericalDiscreteParameter( - name=f"x_{k+1}", - values=list(np.linspace(*BOUNDS[k], POINTS_PER_DIM)), - tolerance=0.01, - ) - for k in range(DIMENSION) -] - -searchspace = SearchSpace.from_product(parameters=parameters) - - -### Creating multiple target object - -# The multi target mode is handled when creating the objective object. -# Thus, we first need to define the different targets. -# We use two targets here. 
-# The first target is maximized and the second target is minimized during the optimization process. - -Target_1 = NumericalTarget( - name="Target_1", mode="MAX", bounds=(0, 100), transformation="LINEAR" -) -Target_2 = NumericalTarget( - name="Target_2", mode="MIN", bounds=(0, 100), transformation="LINEAR" -) - - -### Creating the objective object - -# We collect the two targets in a list and use this list to construct the objective. - -targets = [Target_1, Target_2] - -objective = DesirabilityObjective( - targets=targets, - weights=[20, 30], - scalarizer="MEAN", -) - - -### Constructing a campaign and performing the simulation loop - -campaign = Campaign(searchspace=searchspace, objective=objective) - -# We can now use the `simulate_scenarios` function to simulate a full experiment. - -scenarios = {"BayBE": campaign} - -results = simulate_scenarios( - scenarios, - sum_of_squares, - batch_size=BATCH_SIZE, - n_doe_iterations=N_DOE_ITERATIONS, - n_mc_iterations=N_MC_ITERATIONS, -) - -print(results) From 5b112ddecf7c57fc802836662bafa3f71c7ef8e7 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 4 Dec 2024 16:46:41 +0100 Subject: [PATCH 09/32] Drop hybrid.py example The current version provides minimal additional insights compared to what is already presented in `custom_analytical.py`. Instead of just reiterating the same logic, it should rather compare different hybrid optimizers. For now, it is dropped because refactoring is not worth it. 
--- examples/Backtesting/hybrid.py | 172 --------------------------------- 1 file changed, 172 deletions(-) delete mode 100644 examples/Backtesting/hybrid.py diff --git a/examples/Backtesting/hybrid.py b/examples/Backtesting/hybrid.py deleted file mode 100644 index 932346d70..000000000 --- a/examples/Backtesting/hybrid.py +++ /dev/null @@ -1,172 +0,0 @@ -## Example for full simulation loop using a custom analytical test function in a hybrid space - -# This example shows a simulation loop for a single target with a custom test function as lookup. -# Most importantly, it demonstrates the creation of a custom hybrid searchspace. - -# This examples assumes some basic familiarity with using BayBE and the lookup mechanism. -# We refer to [`campaign`](./../Basics/campaign.md) for a more basic example resp. -# to [`custom_analytical`](./custom_analytical.md) for details on the lookup mechanism. - -### Necessary imports for this example - -import os - -import matplotlib.pyplot as plt -import numpy as np -import seaborn as sns - -from baybe import Campaign -from baybe.objectives import SingleTargetObjective -from baybe.parameters import NumericalContinuousParameter, NumericalDiscreteParameter -from baybe.recommenders import ( - BotorchRecommender, - NaiveHybridSpaceRecommender, - RandomRecommender, - TwoPhaseMetaRecommender, -) -from baybe.searchspace import SearchSpace -from baybe.simulation import simulate_scenarios -from baybe.targets import NumericalTarget - -### Parameters for a full simulation loop - -# For the full simulation, we need to define some additional parameters. -# These are the number of Monte Carlo runs and the number of experiments to be conducted per run. -# `POINTS_PER_DIM` denotes how many points each discrete dimension should contain. - -SMOKE_TEST = "SMOKE_TEST" in os.environ - -N_MC_ITERATIONS = 2 if SMOKE_TEST else 5 -N_DOE_ITERATIONS = 2 if SMOKE_TEST else 5 -POINTS_PER_DIM = 3 if SMOKE_TEST else 6 - - -### Defining the test function. 
- - -# See [`here`](./custom_analytical.md) for details on the custom analytical test function. - - -def sum_of_squares(*x: float) -> float: - """Calculate the sum of squares.""" - res = 0 - for y in x: - res += y**2 - return res - - -# For our actual experiment, we need to specify the number of dimension that we want to use. -# This is necessary to know for the creation of the parameters. -# Similarly, it is necessary to state the bounds of the parameters. -# These should be provided as a list of two-dimensional tuples. - -DIMENSION = 4 -BOUNDS = [(-2, 2), (-2, 2), (-2, 2), (-2, 2)] - -### Constructing the hybrid searchspace - -# Our goal is to construct a hybrid searchspace containing discrete and continuous parameters. -# We thus need to specify which indices should be discrete and which should be continuous. - -CONT_INDICES = [0, 1] -DISC_INDICES = [2, 3] - -# This code verifies whether the provided indices agree with `DIMENSION`. - -if set(CONT_INDICES + DISC_INDICES) != set(range(DIMENSION)): - raise ValueError( - "Either the intersection between CONT_IND and DISC_IND is not empty or your " - "indices do not match." - ) - - -# Construct the continuous parameters as NumericContinuous parameters. - -cont_parameters = [ - NumericalContinuousParameter( - name=f"x_{k+1}", - bounds=(BOUNDS[k]), - ) - for k in CONT_INDICES -] - -# Construct the discrete parameters as `NumericalDiscreteParameters`. - -disc_parameters = [ - NumericalDiscreteParameter( - name=f"x_{k+1}", - values=list(np.linspace(*BOUNDS[k], POINTS_PER_DIM)), - tolerance=0.01, - ) - for k in DISC_INDICES -] - -# Concatenate the continuous and discrete parameters. - -parameters = cont_parameters + disc_parameters - -# Construct searchspace and objective. 
-searchspace = SearchSpace.from_product(parameters=parameters) -objective = SingleTargetObjective(target=NumericalTarget(name="Target", mode="MIN")) - -### Constructing campaigns for the simulation loop - -# This example compares three different available hybrid recommenders: -# The `BotorchRecommender`, the `NaiveHybridSpaceRecommender` and the `RandomRecommender`. -# For each of them, we initialize one recommender object. -# Note that it is possible to further specify the behavior of the `BotorchRecommender`. -# Using the two keywords `hybrid_sampler` and `sampling_percentage`, one can control -# - how much of the discrete subspace should be explored -# - how these points should be sampled. - -# Note that the recommender performs one optimization of the continuous subspace per sampled point. -# We thus recommend to keep this parameter rather low. - -seq_greedy_recommender = TwoPhaseMetaRecommender( - recommender=BotorchRecommender(hybrid_sampler="FPS", sampling_percentage=0.3), -) -naive_hybrid_recommender = TwoPhaseMetaRecommender( - recommender=NaiveHybridSpaceRecommender() -) -random_recommender = TwoPhaseMetaRecommender(recommender=RandomRecommender()) - -# We now create one campaign per recommender. - -seq_greedy_campaign = Campaign( - searchspace=searchspace, - recommender=seq_greedy_recommender, - objective=objective, -) -naive_hybrid_campaign = Campaign( - searchspace=searchspace, - recommender=naive_hybrid_recommender, - objective=objective, -) -random_campaign = Campaign( - searchspace=searchspace, - recommender=random_recommender, - objective=objective, -) - -# We can now use the `simulate_scenarios` function to simulate a full experiment. -# Note that this function enables to run multiple scenarios by a single function call. -# For this, it is necessary to define a dictionary mapping scenario names to campaigns. 
- -scenarios = { - "Sequential greedy": seq_greedy_campaign, - "Naive hybrid": naive_hybrid_campaign, - "Random": random_campaign, -} -results = simulate_scenarios( - scenarios, - sum_of_squares, - batch_size=2, - n_doe_iterations=N_DOE_ITERATIONS, - n_mc_iterations=N_MC_ITERATIONS, -) - -# The following lines plot the results and save the plot in run_analytical.png - -sns.lineplot(data=results, x="Num_Experiments", y="Target_CumBest", hue="Scenario") -plt.gcf().set_size_inches(24, 8) -plt.savefig("./run_hybrid.png") From 2f7dd1cd1cdb6eaee43abb48f56feb96be84fc58 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 4 Dec 2024 17:22:56 +0100 Subject: [PATCH 10/32] Remove outdated references --- docs/userguide/recommenders.md | 6 +----- examples/Basics/recommenders.py | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/docs/userguide/recommenders.md b/docs/userguide/recommenders.md index 351c25d8e..4b8bed542 100644 --- a/docs/userguide/recommenders.md +++ b/docs/userguide/recommenders.md @@ -53,15 +53,11 @@ for various acquisition functions. `n_restarts` and `n_raw_samples` keywords. For details, please refer to [BotorchRecommender](baybe.recommenders.pure.bayesian.botorch.BotorchRecommender). - An example on using this recommender in a hybrid space can be found - [here](./../../examples/Backtesting/hybrid). - * The **[`NaiveHybridSpaceRecommender`](baybe.recommenders.naive.NaiveHybridSpaceRecommender)** can be applied to all search spaces, but is intended to be used in hybrid spaces. This recommender combines individual recommenders for the continuous and the discrete subspaces. It independently optimizes each subspace and consolidates the best results - to generate a candidate for the original hybrid space. An example on using this - recommender in a hybrid space can be found [here](./../../examples/Backtesting/hybrid). + to generate a candidate for the original hybrid space. 
### Clustering Recommenders diff --git a/examples/Basics/recommenders.py b/examples/Basics/recommenders.py index 2653863e7..98f197212 100644 --- a/examples/Basics/recommenders.py +++ b/examples/Basics/recommenders.py @@ -116,10 +116,6 @@ print(recommender) -# Note that there are the additional keywords `hybrid_sampler` and `sampling_percentag`. -# Their meaning and how to use and define it are explained in the hybrid backtesting example. -# We thus refer to [`hybrid`](./../Backtesting/hybrid.md) for details on these. - ### Example Searchspace and objective parameters # We use the same data used in the [`campaign`](./campaign.md) example. From 9596fbc23e7c455f7437c831d812d9c80bb21e29 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 5 Dec 2024 17:20:27 +0100 Subject: [PATCH 11/32] Remove botorch_function_wrapper --- CHANGELOG.md | 3 +++ baybe/utils/botorch_wrapper.py | 29 ----------------------------- 2 files changed, 3 insertions(+), 29 deletions(-) delete mode 100644 baybe/utils/botorch_wrapper.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d7a06617c..d425c7ffd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,6 +42,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Crash when using `ContinuousCardinalityConstraint` caused by an unintended interplay between constraints and dropped parameters yielding empty parameter sets +### Removed +- `botorch_function_wrapper` utility for creating lookup callables + ### Deprecations - Passing a dataframe via the `data` argument to `Objective.transform` is no longer possible. The dataframe must now be passed as positional argument. 
diff --git a/baybe/utils/botorch_wrapper.py b/baybe/utils/botorch_wrapper.py deleted file mode 100644 index 74a9defc7..000000000 --- a/baybe/utils/botorch_wrapper.py +++ /dev/null @@ -1,29 +0,0 @@ -"""A wrapper class for synthetic BoTorch test functions.""" - -import torch -from botorch.test_functions import SyntheticTestFunction - -from baybe.utils.torch import DTypeFloatTorch - - -def botorch_function_wrapper(test_function: SyntheticTestFunction): - """Turn a BoTorch test function into a format accepted by lookup in simulations. - - See :mod:`baybe.simulation` for details. - - Args: - test_function: The synthetic test function from BoTorch. See - https://botorch.org/api/test_functions.html. - - Returns: - A wrapped version of the provided function. - """ - - def wrapper(*x: float) -> float: - # Cast the provided list of floats to a tensor. - x_tensor = torch.tensor(x, dtype=DTypeFloatTorch) - result = test_function.forward(x_tensor) - # We do not need to return a tuple here. - return float(result) - - return wrapper From f976b552ba7ce8482a6f57d8cfbb9c722cd22645 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 6 Dec 2024 08:13:51 +0100 Subject: [PATCH 12/32] Explicitly convert target to objective --- benchmarks/domains/synthetic_2C1D_1C.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/domains/synthetic_2C1D_1C.py b/benchmarks/domains/synthetic_2C1D_1C.py index d60259e76..ae32f9371 100644 --- a/benchmarks/domains/synthetic_2C1D_1C.py +++ b/benchmarks/domains/synthetic_2C1D_1C.py @@ -64,16 +64,17 @@ def synthetic_2C1D_1C(settings: ConvergenceExperimentSettings) -> DataFrame: target = NumericalTarget(name="target", mode="MAX") searchspace = SearchSpace.from_product(parameters=parameters) + objective = target.to_objective() scenarios: dict[str, Campaign] = { "Random Recommender": Campaign( searchspace=searchspace, recommender=RandomRecommender(), - objective=target, + objective=objective, ), "Default Recommender": Campaign( 
searchspace=searchspace, - objective=target, + objective=objective, ), } From 47676920f75c16a20eb0a256cb495d98826fa05d Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Mon, 16 Dec 2024 08:40:47 +0100 Subject: [PATCH 13/32] Use dataframe-based lookup callable in user-facing code --- CHANGELOG.md | 1 - baybe/simulation/__init__.py | 2 -- baybe/simulation/lookup.py | 41 +---------------------- benchmarks/domains/synthetic_2C1D_1C.py | 13 ++++--- examples/Backtesting/custom_analytical.py | 17 ++++------ 5 files changed, 17 insertions(+), 57 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d425c7ffd..75559196b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `filter_df` utility for filtering dataframe content - `DiscreteConstraint.get_valid` to conveniently access valid candidates - Functionality for persisting benchmarking results on S3 from a manual pipeline run -- `label_columns` decorator utility for convenient creation of lookup callables ### Changed - `SubstanceParameter` encodings are now computed exclusively with the diff --git a/baybe/simulation/__init__.py b/baybe/simulation/__init__.py index e27df1999..bfde8efc5 100644 --- a/baybe/simulation/__init__.py +++ b/baybe/simulation/__init__.py @@ -16,12 +16,10 @@ """ from baybe.simulation.core import simulate_experiment -from baybe.simulation.lookup import label_columns from baybe.simulation.scenarios import simulate_scenarios from baybe.simulation.transfer_learning import simulate_transfer_learning __all__ = [ - "label_columns", "simulate_experiment", "simulate_scenarios", "simulate_transfer_learning", diff --git a/baybe/simulation/lookup.py b/baybe/simulation/lookup.py index 10aa2b0e6..d58c9aa05 100644 --- a/baybe/simulation/lookup.py +++ b/baybe/simulation/lookup.py @@ -2,9 +2,8 @@ from __future__ import annotations -import functools import logging -from collections.abc import Callable, Collection, 
Sequence +from collections.abc import Callable, Collection from typing import Literal import numpy as np @@ -139,41 +138,3 @@ def _look_up_targets_from_dataframe( # Add the lookup values queries.loc[:, target_names] = np.asarray(all_match_vals) - - -def label_columns( - input_labels: Sequence[str], output_labels: Sequence[str] -) -> Callable: - """Create a decorator for labeling the inputs and outputs of array-based callables. - - The decorator transforms a callable designed to work with unlabelled arrays such - that it can operate with dataframes instead. The original callable is expected to - accept and return two-dimensional arrays. When decorated, the callable accepts and - returns dataframes whose columns are mapped to the corresponding arrays based on the - specified label sequences. - - Args: - input_labels: The sequence of input labels mapping the columns of the input - dataframe to columns of the input array in the specified order. - output_labels: The sequence of output labels mapping the columns of the output - dataframe to columns of the output array in the specified order. - - Returns: - The decorator for the given input and output labels. 
- """ - - def decorator( - fn: Callable[[np.ndarray], np.ndarray], - ) -> Callable[[pd.DataFrame], pd.DataFrame]: - """Turn an array-based callable into a dataframe-based callable.""" - - @functools.wraps(fn) - def wrapper(df: pd.DataFrame, /) -> pd.DataFrame: - """Translate to/from an array-based callable using dataframes.""" - array_in = df[input_labels].to_numpy() - array_out = fn(array_in) - return pd.DataFrame(array_out, columns=output_labels, index=df.index) - - return wrapper - - return decorator diff --git a/benchmarks/domains/synthetic_2C1D_1C.py b/benchmarks/domains/synthetic_2C1D_1C.py index ae32f9371..94c475ac4 100644 --- a/benchmarks/domains/synthetic_2C1D_1C.py +++ b/benchmarks/domains/synthetic_2C1D_1C.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING import numpy as np +import pandas as pd from numpy import pi, sin, sqrt from pandas import DataFrame @@ -13,7 +14,6 @@ from baybe.recommenders import RandomRecommender from baybe.searchspace import SearchSpace from baybe.simulation import simulate_scenarios -from baybe.simulation.lookup import label_columns from baybe.targets import NumericalTarget from benchmarks.definition import ( Benchmark, @@ -25,7 +25,7 @@ def _lookup(arr: np.ndarray, /) -> np.ndarray: - """Lookup that is used internally in the callable for the benchmark.""" + """Numpy-based lookup callable defining the objective function.""" x, y, z = np.array_split(arr, 3, axis=1) try: assert np.all(-2 * pi <= x) and np.all(x <= 2 * pi) @@ -42,6 +42,13 @@ def _lookup(arr: np.ndarray, /) -> np.ndarray: ) +def lookup(df: pd.DataFrame, /) -> pd.DataFrame: + """Dataframe-based lookup callable used as the loop-closing element.""" + return pd.DataFrame( + _lookup(df[["x", "y", "z"]].to_numpy()), columns=["target"], index=df.index + ) + + def synthetic_2C1D_1C(settings: ConvergenceExperimentSettings) -> DataFrame: """Hybrid synthetic test function. 
@@ -78,8 +85,6 @@ def synthetic_2C1D_1C(settings: ConvergenceExperimentSettings) -> DataFrame: ), } - lookup = label_columns([p.name for p in parameters], [target.name])(_lookup) - return simulate_scenarios( scenarios, lookup, diff --git a/examples/Backtesting/custom_analytical.py b/examples/Backtesting/custom_analytical.py index 1bef726b3..3d16abed9 100644 --- a/examples/Backtesting/custom_analytical.py +++ b/examples/Backtesting/custom_analytical.py @@ -9,14 +9,14 @@ import os -import numpy as np +import pandas as pd import seaborn as sns from baybe import Campaign from baybe.parameters.numerical import NumericalContinuousParameter from baybe.recommenders import RandomRecommender from baybe.searchspace import SearchSpace -from baybe.simulation import label_columns, simulate_scenarios +from baybe.simulation import simulate_scenarios from baybe.targets import NumericalTarget from baybe.utils.plotting import create_example_plots @@ -44,16 +44,13 @@ target = NumericalTarget(name="Target", mode="MIN") -# Based on the above, we construct the black-box callable to be optimized. 
-# Using the {func}`~baybe.simulation.lookup.label_columns` decorator, we can easily map -# the columns of the raw input/output arrays to our parameter and target objects, which -# creates the required dataframe-based lookup for the optimization loop: +# Based on the above, we construct the black-box callable to be optimized, which +# provides the lookup mechanism for closing the optimization loop: -@label_columns([p.name for p in parameters], [target.name]) -def sum_of_squares(x: np.ndarray, /) -> np.ndarray: +def blackbox(df: pd.DataFrame, /) -> pd.DataFrame: """Calculate the sum of squares.""" - return (x**2).sum(axis=1, keepdims=True) + return (df[[p.name for p in parameters]] ** 2).sum(axis=1).to_frame(target.name) # What remains is to construct the search space and objective for the optimization: @@ -89,7 +86,7 @@ def sum_of_squares(x: np.ndarray, /) -> np.ndarray: } results = simulate_scenarios( scenarios, - sum_of_squares, + blackbox, batch_size=BATCH_SIZE, n_doe_iterations=N_DOE_ITERATIONS, n_mc_iterations=N_MC_ITERATIONS, From e9b5db0171ad7756998d8e83f8081fde1f6cdff4 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Mon, 16 Dec 2024 09:22:40 +0100 Subject: [PATCH 14/32] Add back continuous search space example --- examples/Searchspaces/continuous_space.py | 48 +++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 examples/Searchspaces/continuous_space.py diff --git a/examples/Searchspaces/continuous_space.py b/examples/Searchspaces/continuous_space.py new file mode 100644 index 000000000..a22be7c33 --- /dev/null +++ b/examples/Searchspaces/continuous_space.py @@ -0,0 +1,48 @@ +## Creating Continuous Search Spaces + +# This example illustrates several ways to create continuous search spaces. 
+ +### Imports + +import numpy as np + +from baybe.parameters import NumericalContinuousParameter +from baybe.searchspace import SearchSpace, SubspaceContinuous + +### Settings + +# We begin by defining the continuous parameters that span our space: + +DIMENSION = 4 +BOUNDS = (-1, 1) + +parameters = [ + NumericalContinuousParameter(name=f"x_{k+1}", bounds=BOUNDS) + for k in range(DIMENSION) +] + +# From these parameter objects, we can now construct a continuous subspace. +# Let us draw some samples from it and verify that they are within the bounds: + +subspace = SubspaceContinuous(parameters) +samples = subspace.sample_uniform(10) +print(samples) +assert np.all(samples >= BOUNDS[0]) and np.all(samples <= BOUNDS[1]) + +# There are several ways we can turn the above objects into a search space. +# This provides a lot of flexibility depending on the context: + +# Using conversion: +searchspace1 = SubspaceContinuous(parameters).to_searchspace() + +# Explicit attribute assignment via the regular search space constructor: +searchspace2 = SearchSpace(continuous=SubspaceContinuous(parameters)) + +# Using an alternative search space constructor: +searchspace3 = SearchSpace.from_product(parameters=parameters) + + +# No matter which version we choose, we can be sure that the resulting search space +# objects are equivalent: + +assert searchspace1 == searchspace2 == searchspace3 From 3bedbc3d4b6d6be26cb90839ffaa2b57c100608b Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Mon, 16 Dec 2024 09:39:25 +0100 Subject: [PATCH 15/32] Simplify inplace column assignment --- baybe/simulation/lookup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/baybe/simulation/lookup.py b/baybe/simulation/lookup.py index d58c9aa05..7bda503d4 100644 --- a/baybe/simulation/lookup.py +++ b/baybe/simulation/lookup.py @@ -82,8 +82,7 @@ def _look_up_targets_from_callable( ) -> None: """Look up target values by querying a callable.""" df_targets = lookup(queries) - for col in 
df_targets: - queries[col] = df_targets[col] + queries[df_targets.columns] = df_targets def _look_up_targets_from_dataframe( From 2409f75a20820d057effbb223b0a6494b1769a53 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Mon, 16 Dec 2024 13:00:57 +0100 Subject: [PATCH 16/32] Replace botorch_function_wrapper calls --- baybe/utils/dataframe.py | 49 ++++++++++++++++++- .../Constraints_Continuous/hybrid_space.py | 31 ++++++------ .../linear_constraints.py | 29 ++++++----- .../probability_of_improvement.py | 21 ++++---- examples/Searchspaces/discrete_space.py | 32 +++++------- examples/Searchspaces/hybrid_space.py | 30 ++++++------ examples/Transfer_Learning/backtesting.py | 24 +++++---- .../basic_transfer_learning.py | 18 ++++--- 8 files changed, 140 insertions(+), 94 deletions(-) diff --git a/baybe/utils/dataframe.py b/baybe/utils/dataframe.py index 96e0e1dea..640eaacb3 100644 --- a/baybe/utils/dataframe.py +++ b/baybe/utils/dataframe.py @@ -2,8 +2,9 @@ from __future__ import annotations +import functools import logging -from collections.abc import Collection, Iterable, Sequence +from collections.abc import Callable, Collection, Iterable, Sequence from typing import TYPE_CHECKING, Literal, TypeVar, overload import numpy as np @@ -21,6 +22,8 @@ from baybe.targets.base import Target _T = TypeVar("_T", bound=Parameter | Target) + _ArrayLike = TypeVar("_ArrayLike", np.ndarray, Tensor) + # Logging _logger = logging.getLogger(__name__) @@ -678,3 +681,47 @@ def filter_df( out.index.name = index_name return out + + +def label_input_and_output_columns( + input_labels: Sequence[str], output_labels: Sequence[str], use_torch: bool = False +) -> Callable: + """Create a decorator for labeling the inputs and outputs of array-based callables. + + The decorator transforms a callable designed to work with unlabelled arrays such + that it can operate with dataframes instead. The original callable is expected to + accept and return two-dimensional arrays. 
When decorated, the callable accepts and + returns dataframes whose columns are mapped to the corresponding arrays based on the + specified label sequences. + + Args: + input_labels: The sequence of labels for the input columns. + output_labels: The sequence of labels for the output columns. + use_torch: Flag indicating if the callable is to be called with a numpy array + or with a torch tensor. + + Returns: + The decorator for the given input and output labels. + """ + + def decorator( + fn: Callable[[_ArrayLike], _ArrayLike], / + ) -> Callable[[pd.DataFrame], pd.DataFrame]: + """Turn an array-based callable into a dataframe-based callable.""" + + @functools.wraps(fn) + def wrapper(df: pd.DataFrame, /) -> pd.DataFrame: + """Translate to/from an array-based callable using dataframes.""" + array_in = df[list(input_labels)].to_numpy() + if use_torch: + import torch + + with torch.no_grad(): + array_out = fn(torch.from_numpy(array_in)).numpy() + else: + array_out = fn(array_in) + return pd.DataFrame(array_out, columns=list(output_labels), index=df.index) + + return wrapper + + return decorator diff --git a/examples/Constraints_Continuous/hybrid_space.py b/examples/Constraints_Continuous/hybrid_space.py index 174b25009..f88d873aa 100644 --- a/examples/Constraints_Continuous/hybrid_space.py +++ b/examples/Constraints_Continuous/hybrid_space.py @@ -2,8 +2,7 @@ # Example for optimizing a synthetic test functions in a hybrid space with one # constraint in the discrete subspace and one constraint in the continuous subspace. -# All test functions that are available in BoTorch are also available here and wrapped -# via the `botorch_function_wrapper`. +# All test functions that are available in BoTorch are also available here. # This example assumes some basic familiarity with using BayBE. # We thus refer to [`campaign`](./../Basics/campaign.md) for a basic example. # Also, there is a large overlap with other examples with regards to using the test function. 
@@ -14,6 +13,7 @@ ### Necessary imports for this example import numpy as np +import pandas as pd from botorch.test_functions import Rastrigin from baybe import Campaign @@ -22,11 +22,10 @@ DiscreteSumConstraint, ThresholdCondition, ) -from baybe.objectives import SingleTargetObjective from baybe.parameters import NumericalContinuousParameter, NumericalDiscreteParameter from baybe.searchspace import SearchSpace from baybe.targets import NumericalTarget -from baybe.utils.botorch_wrapper import botorch_function_wrapper +from baybe.utils.dataframe import label_input_and_output_columns ### Defining the test function @@ -48,7 +47,6 @@ DIMENSION = TestFunctionClass().dim BOUNDS = TestFunction.bounds -WRAPPED_FUNCTION = botorch_function_wrapper(test_function=TestFunction) ### Creating the searchspace and the objective @@ -90,7 +88,14 @@ ] searchspace = SearchSpace.from_product(parameters=parameters, constraints=constraints) -objective = SingleTargetObjective(target=NumericalTarget(name="Target", mode="MIN")) +target = NumericalTarget(name="Target", mode="MIN") +objective = target.to_objective() + +### Wrap the test function as a dataframe-based lookup callable + +lookup = label_input_and_output_columns( + [p.name for p in parameters], [target.name], use_torch=True +)(TestFunction) ### Construct the campaign and run some iterations @@ -102,17 +107,11 @@ BATCH_SIZE = 5 N_ITERATIONS = 2 -for k in range(N_ITERATIONS): +for _ in range(N_ITERATIONS): recommendation = campaign.recommend(batch_size=BATCH_SIZE) - - # target value are looked up via the botorch wrapper - target_values = [] - for index, row in recommendation.iterrows(): - target_values.append(WRAPPED_FUNCTION(*row.to_list())) - - recommendation["Target"] = target_values - - campaign.add_measurements(recommendation) + lookup_values = lookup(recommendation) + measurements = pd.concat([recommendation, lookup_values], axis=1) + campaign.add_measurements(measurements) ### Verify the constraints measurements = 
campaign.measurements diff --git a/examples/Constraints_Continuous/linear_constraints.py b/examples/Constraints_Continuous/linear_constraints.py index c51d2d0c5..928b0fb93 100644 --- a/examples/Constraints_Continuous/linear_constraints.py +++ b/examples/Constraints_Continuous/linear_constraints.py @@ -2,8 +2,7 @@ # Example for optimizing a synthetic test functions in a continuous space with linear # constraints. -# All test functions that are available in BoTorch are also available here and wrapped -# via the `botorch_function_wrapper`. +# All test functions that are available in BoTorch are also available here. # This example assumes some basic familiarity with using BayBE. # We thus refer to [`campaign`](./../Basics/campaign.md) for a basic example. # Also, there is a large overlap with other examples with regards to using the test function. @@ -15,15 +14,15 @@ import os import numpy as np +import pandas as pd from botorch.test_functions import Rastrigin from baybe import Campaign from baybe.constraints import ContinuousLinearConstraint -from baybe.objectives import SingleTargetObjective from baybe.parameters import NumericalContinuousParameter from baybe.searchspace import SearchSpace from baybe.targets import NumericalTarget -from baybe.utils.botorch_wrapper import botorch_function_wrapper +from baybe.utils.dataframe import label_input_and_output_columns ### Defining the test function @@ -39,7 +38,6 @@ DIMENSION = TestFunctionClass().dim BOUNDS = TestFunction.bounds -WRAPPED_FUNCTION = botorch_function_wrapper(test_function=TestFunction) ### Creating the searchspace and the objective @@ -76,7 +74,14 @@ ] searchspace = SearchSpace.from_product(parameters=parameters, constraints=constraints) -objective = SingleTargetObjective(target=NumericalTarget(name="Target", mode="MIN")) +target = NumericalTarget(name="Target", mode="MIN") +objective = target.to_objective() + +### Wrap the test function as a dataframe-based lookup callable + +lookup = 
label_input_and_output_columns( + [p.name for p in parameters], [target.name], use_torch=True +)(TestFunction) ### Construct the campaign and run some iterations @@ -94,15 +99,9 @@ for k in range(N_ITERATIONS): recommendation = campaign.recommend(batch_size=BATCH_SIZE) - - # target value are looked up via the botorch wrapper - target_values = [] - for index, row in recommendation.iterrows(): - target_values.append(WRAPPED_FUNCTION(*row.to_list())) - - recommendation["Target"] = target_values - - campaign.add_measurements(recommendation) + lookup_values = lookup(recommendation) + measurements = pd.concat([recommendation, lookup_values], axis=1) + campaign.add_measurements(measurements) ### Verify the constraints diff --git a/examples/Custom_Hooks/probability_of_improvement.py b/examples/Custom_Hooks/probability_of_improvement.py index 6f8767dc4..6cdd96687 100644 --- a/examples/Custom_Hooks/probability_of_improvement.py +++ b/examples/Custom_Hooks/probability_of_improvement.py @@ -27,7 +27,6 @@ from baybe.acquisition import ProbabilityOfImprovement from baybe.campaign import Campaign from baybe.objectives.base import Objective -from baybe.objectives.single import SingleTargetObjective from baybe.parameters import NumericalDiscreteParameter from baybe.recommenders import ( BotorchRecommender, @@ -38,8 +37,7 @@ from baybe.surrogates import GaussianProcessSurrogate from baybe.targets import NumericalTarget from baybe.utils.basic import register_hooks -from baybe.utils.botorch_wrapper import botorch_function_wrapper -from baybe.utils.dataframe import to_tensor +from baybe.utils.dataframe import label_input_and_output_columns, to_tensor from baybe.utils.plotting import create_example_plots from baybe.utils.random import set_random_seed @@ -122,7 +120,6 @@ def extract_pi( # With all preparations completed, we can set up the campaign: test_function = Hartmann(dim=DIMENSION) -wrapped_function = botorch_function_wrapper(test_function=test_function) discrete_params = [ 
NumericalDiscreteParameter( @@ -133,20 +130,26 @@ def extract_pi( ] searchspace = SearchSpace.from_product(parameters=discrete_params) -objective = SingleTargetObjective(target=NumericalTarget(name="Target", mode="MIN")) +target = NumericalTarget(name="Target", mode="MIN") +objective = target.to_objective() campaign = Campaign( searchspace=searchspace, recommender=recommender, objective=objective, ) + # Now, we perform a couple of experimental iterations with the active hook: +lookup = label_input_and_output_columns( + [p.name for p in discrete_params], [target.name], use_torch=True +)(test_function) + for i in range(N_DOE_ITERATIONS): - recommendation = campaign.recommend(BATCH_SIZE) - target_values = recommendation.apply(lambda x: wrapped_function(*x.values), axis=1) - recommendation["Target"] = target_values - campaign.add_measurements(recommendation) + recommendation = campaign.recommend(batch_size=BATCH_SIZE) + lookup_values = lookup(recommendation) + measurements = pd.concat([recommendation, lookup_values], axis=1) + campaign.add_measurements(measurements) ### Plotting the Results diff --git a/examples/Searchspaces/discrete_space.py b/examples/Searchspaces/discrete_space.py index f69f6ac95..9f87a783d 100644 --- a/examples/Searchspaces/discrete_space.py +++ b/examples/Searchspaces/discrete_space.py @@ -6,14 +6,14 @@ ### Necessary imports for this example import numpy as np +import pandas as pd from botorch.test_functions import Rastrigin from baybe import Campaign -from baybe.objectives import SingleTargetObjective from baybe.parameters import NumericalDiscreteParameter from baybe.searchspace import SearchSpace from baybe.targets import NumericalTarget -from baybe.utils.botorch_wrapper import botorch_function_wrapper +from baybe.utils.dataframe import label_input_and_output_columns ### Defining the test function @@ -23,8 +23,7 @@ # In addition, the dimension is then adjusted automatically. 
# Note that choosing a different test function requires to change the `import` statement. -# All test functions that are available in BoTorch are also available here and are later wrapped -# via the `botorch_function_wrapper`. +# All test functions that are available in BoTorch are also available here. DIMENSION = 4 TestFunctionClass = Rastrigin @@ -53,11 +52,6 @@ BOUNDS = TestFunction.bounds -# It is necessary to "translate" the BoTorch function such that it can be used by BayBE. -# This is done by using the `botorch_function_wrapper` function. - -WRAPPED_FUNCTION = botorch_function_wrapper(test_function=TestFunction) - ### Creating the searchspace and the objective # In this example, we construct a purely discrete space. @@ -79,7 +73,8 @@ ] searchspace = SearchSpace.from_product(parameters=parameters) -objective = SingleTargetObjective(target=NumericalTarget(name="Target", mode="MIN")) +target = NumericalTarget(name="Target", mode="MIN") +objective = target.to_objective() ### Constructing the campaign and performing a recommendation @@ -93,19 +88,16 @@ recommendation = campaign.recommend(batch_size=BATCH_SIZE) # Evaluate the test function. -# Note that we need iterate through the rows of the recommendation. -# Furthermore, we need to interpret the row as a list. - -target_values = [] -for index, row in recommendation.iterrows(): - target_values.append(WRAPPED_FUNCTION(*row.to_list())) -# We add an additional column with the calculated target values. +lookup = label_input_and_output_columns( + [p.name for p in parameters], [target.name], use_torch=True +)(TestFunction) -recommendation["Target"] = target_values +lookup_values = lookup(recommendation) +measurements = pd.concat([recommendation, lookup_values], axis=1) # Here, we inform the campaign about our measurement. 
-campaign.add_measurements(recommendation) +campaign.add_measurements(measurements) print("\n\nRecommended experiments with measured values: ") -print(recommendation) +print(measurements) diff --git a/examples/Searchspaces/hybrid_space.py b/examples/Searchspaces/hybrid_space.py index c0d586745..d2556413d 100644 --- a/examples/Searchspaces/hybrid_space.py +++ b/examples/Searchspaces/hybrid_space.py @@ -10,15 +10,15 @@ ### Necessary imports for this example import numpy as np +import pandas as pd from botorch.test_functions import Rastrigin from baybe import Campaign -from baybe.objectives import SingleTargetObjective from baybe.parameters import NumericalContinuousParameter, NumericalDiscreteParameter from baybe.recommenders import NaiveHybridSpaceRecommender, TwoPhaseMetaRecommender from baybe.searchspace import SearchSpace from baybe.targets import NumericalTarget -from baybe.utils.botorch_wrapper import botorch_function_wrapper +from baybe.utils.dataframe import label_input_and_output_columns ### Defining the test function and the hybrid dimensions @@ -70,7 +70,6 @@ ) BOUNDS = TestFunction.bounds -WRAPPED_FUNCTION = botorch_function_wrapper(test_function=TestFunction) ### Constructing the hybrid searchspace @@ -100,7 +99,15 @@ ] searchspace = SearchSpace.from_product(parameters=disc_parameters + cont_parameters) -objective = SingleTargetObjective(target=NumericalTarget(name="Target", mode="MIN")) +target = NumericalTarget(name="Target", mode="MIN") +objective = target.to_objective() + +### Wrap the test function as a dataframe-based lookup callable + +lookup = label_input_and_output_columns( + searchspace.parameter_names, [target.name], use_torch=True +)(TestFunction) + ### Constructing hybrid recommenders @@ -124,19 +131,12 @@ recommendation = campaign.recommend(batch_size=BATCH_SIZE) # Evaluate the test function. -# Note that we need iterate through the rows of the recommendation. -# Furthermore, we need to interpret the row as a list. 
- -target_values = [] -for index, row in recommendation.iterrows(): - target_values.append(WRAPPED_FUNCTION(*row.to_list())) - -# We add an additional column with the calculated target values. -recommendation["Target"] = target_values +lookup_values = lookup(recommendation) +measurements = pd.concat([recommendation, lookup_values], axis=1) # Here, we inform the campaign about our measurement. -campaign.add_measurements(recommendation) +campaign.add_measurements(measurements) print("\n\nRecommended experiments with measured values: ") -print(recommendation) +print(measurements) diff --git a/examples/Transfer_Learning/backtesting.py b/examples/Transfer_Learning/backtesting.py index 2c0374185..b44c6e448 100644 --- a/examples/Transfer_Learning/backtesting.py +++ b/examples/Transfer_Learning/backtesting.py @@ -16,14 +16,14 @@ import pandas as pd import seaborn as sns from botorch.test_functions.synthetic import Hartmann +from torch import Tensor from baybe import Campaign -from baybe.objectives import SingleTargetObjective from baybe.parameters import NumericalDiscreteParameter, TaskParameter from baybe.searchspace import SearchSpace from baybe.simulation import simulate_scenarios, simulate_transfer_learning from baybe.targets import NumericalTarget -from baybe.utils.botorch_wrapper import botorch_function_wrapper +from baybe.utils.dataframe import label_input_and_output_columns from baybe.utils.plotting import create_example_plots ### Settings @@ -44,7 +44,8 @@ # The corresponding [Objective](baybe.objective.Objective) # is created as follows: -objective = SingleTargetObjective(target=NumericalTarget(name="Target", mode="MIN")) +target = NumericalTarget(name="Target", mode="MIN") +objective = target.to_objective() ### Creating the Search Space @@ -91,16 +92,19 @@ # and vice versa. The used model is of course not aware of this relationship but # needs to infer it from the data gathered during the optimization process. 
+wrapper = label_input_and_output_columns( + [p.name for p in discrete_params], [target.name], use_torch=True +) + -def shifted_hartmann(*x: float) -> float: - """Calculate a shifted, scaled and noisy variant of the Hartman function.""" - noised_hartmann = Hartmann(dim=DIMENSION, noise_std=0.15) - return 2.5 * botorch_function_wrapper(noised_hartmann)(x) + 3.25 +def shifted_hartmann(x: Tensor, /) -> Tensor: + """Calculate a shifted, scaled and noisy variant of the Hartmann function.""" + return 2.5 * Hartmann(dim=DIMENSION, noise_std=0.15)(x) + 3.25 test_functions = { - "Hartmann": botorch_function_wrapper(Hartmann(dim=DIMENSION)), - "Shifted": shifted_hartmann, + "Hartmann": wrapper(Hartmann(dim=DIMENSION)), + "Shifted": wrapper(shifted_hartmann), } ### Generating Lookup Tables @@ -116,7 +120,7 @@ def shifted_hartmann(*x: float) -> float: lookups: dict[str, pd.DataFrame] = {} for function_name, function in test_functions.items(): lookup = pd.DataFrame({f"x{d}": grid_d.ravel() for d, grid_d in enumerate(grid)}) - lookup["Target"] = tuple(lookup.apply(function, axis=1)) + lookup = pd.concat([lookup, function(lookup)], axis=1) lookup["Function"] = function_name lookups[function_name] = lookup lookup = pd.concat([lookups["Hartmann"], lookups["Shifted"]]).reset_index() diff --git a/examples/Transfer_Learning/basic_transfer_learning.py b/examples/Transfer_Learning/basic_transfer_learning.py index e5d849334..7883659ae 100644 --- a/examples/Transfer_Learning/basic_transfer_learning.py +++ b/examples/Transfer_Learning/basic_transfer_learning.py @@ -17,12 +17,11 @@ from botorch.test_functions.synthetic import Hartmann from baybe import Campaign -from baybe.objectives import SingleTargetObjective from baybe.parameters import NumericalDiscreteParameter, TaskParameter from baybe.searchspace import SearchSpace from baybe.simulation import simulate_scenarios from baybe.targets import NumericalTarget -from baybe.utils.botorch_wrapper import botorch_function_wrapper +from 
baybe.utils.dataframe import label_input_and_output_columns from baybe.utils.plotting import create_example_plots ### Settings @@ -43,7 +42,8 @@ # The corresponding [Objective](baybe.objective.Objective) # is created as follows: -objective = SingleTargetObjective(target=NumericalTarget(name="Target", mode="MIN")) +target = NumericalTarget(name="Target", mode="MIN") +objective = target.to_objective() ### Creating the Searchspace @@ -96,11 +96,13 @@ # noise. The used model is of course not aware of this relationship but needs to infer # it from the data gathered during the optimization process. +wrapper = label_input_and_output_columns( + [p.name for p in discrete_params], [target.name], use_torch=True +) + test_functions = { - "Test_Function": botorch_function_wrapper(Hartmann(dim=DIMENSION)), - "Training_Function": botorch_function_wrapper( - Hartmann(dim=DIMENSION, negate=True, noise_std=0.15) - ), + "Test_Function": wrapper(Hartmann(dim=DIMENSION)), + "Training_Function": wrapper(Hartmann(dim=DIMENSION, negate=True, noise_std=0.15)), } # (Lookup)= @@ -117,7 +119,7 @@ lookups: dict[str, pd.DataFrame] = {} for function_name, function in test_functions.items(): lookup = pd.DataFrame({f"x{d}": grid_d.ravel() for d, grid_d in enumerate(grid)}) - lookup["Target"] = lookup.apply(function, axis=1) + lookup = pd.concat([lookup, function(lookup)], axis=1) lookup["Function"] = function_name lookups[function_name] = lookup lookup_training_task = lookups["Training_Function"] From f2a0157d5f8c2b989f4ba11b20f8235ea7943e98 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Mon, 16 Dec 2024 13:11:57 +0100 Subject: [PATCH 17/32] Add return type to decorator utility --- baybe/utils/dataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/baybe/utils/dataframe.py b/baybe/utils/dataframe.py index 640eaacb3..3d3be7cfa 100644 --- a/baybe/utils/dataframe.py +++ b/baybe/utils/dataframe.py @@ -685,7 +685,9 @@ def filter_df( def label_input_and_output_columns( 
input_labels: Sequence[str], output_labels: Sequence[str], use_torch: bool = False -) -> Callable: +) -> Callable[ + [Callable[[_ArrayLike], _ArrayLike]], Callable[[pd.DataFrame], pd.DataFrame] +]: """Create a decorator for labeling the inputs and outputs of array-based callables. The decorator transforms a callable designed to work with unlabelled arrays such From 44bf262421f78f2f76efa0a44e549a965ce87d0a Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Mon, 16 Dec 2024 13:27:16 +0100 Subject: [PATCH 18/32] Increase smoke test problem dimension --- examples/Backtesting/custom_analytical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Backtesting/custom_analytical.py b/examples/Backtesting/custom_analytical.py index 3d16abed9..2381a509b 100644 --- a/examples/Backtesting/custom_analytical.py +++ b/examples/Backtesting/custom_analytical.py @@ -29,7 +29,7 @@ BATCH_SIZE = 1 N_MC_ITERATIONS = 2 if SMOKE_TEST else 20 N_DOE_ITERATIONS = 2 if SMOKE_TEST else 30 -DIMENSION = 1 if SMOKE_TEST else 10 +DIMENSION = 2 if SMOKE_TEST else 10 BOUNDS = (-1, 1) ### Defining the Optimization Problem From 0ce6171b1caccd77c27ca89b3eb5d86ea4019198 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Mon, 16 Dec 2024 13:48:18 +0100 Subject: [PATCH 19/32] Fix sphinx references --- docs/userguide/recommenders.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/userguide/recommenders.md b/docs/userguide/recommenders.md index 4b8bed542..4c7f690a3 100644 --- a/docs/userguide/recommenders.md +++ b/docs/userguide/recommenders.md @@ -46,11 +46,15 @@ for various acquisition functions. spaces, as it does gradient-based optimization in the continuous part of the space while exhaustively evaluating configurations of the discrete subspace. 
You can customize this behavior to only sample a certain percentage of the discrete subspace via the - `sample_percentage` attribute and to choose different sampling algorithms via the - `hybrid_sampler` attribute. + {attr}`~baybe.recommenders.pure.bayesian.botorch.BotorchRecommender.sampling_percentage` + argument and to choose different sampling algorithms via the + {attr}`~baybe.recommenders.pure.bayesian.botorch.BotorchRecommender.hybrid_sampler` + argument. The gradient-based optimization part can also further be controlled by the - `n_restarts` and `n_raw_samples` keywords. For details, please refer + {attr}`~baybe.recommenders.pure.bayesian.botorch.BotorchRecommender.n_restarts` and + {attr}`~baybe.recommenders.pure.bayesian.botorch.BotorchRecommender.n_raw_samples` + arguments. For details, please refer to [BotorchRecommender](baybe.recommenders.pure.bayesian.botorch.BotorchRecommender). * The **[`NaiveHybridSpaceRecommender`](baybe.recommenders.naive.NaiveHybridSpaceRecommender)** From e6c12e4dd45258be08c808a1a359c65d9d761ed6 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Mon, 16 Dec 2024 15:29:27 +0100 Subject: [PATCH 20/32] Revise and extend lookup user guide --- docs/userguide/simulation.md | 97 +++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 17 deletions(-) diff --git a/docs/userguide/simulation.md b/docs/userguide/simulation.md index d98932b3e..bd2df9cd5 100644 --- a/docs/userguide/simulation.md +++ b/docs/userguide/simulation.md @@ -1,6 +1,6 @@ # Simulation -BayBE offers multiple functionalities to "simulate" experimental campaigns with a given lookup mechanism. This user guide briefly introduces how to use the methods available in our [simulation submodule](baybe.simulation). +BayBE offers multiple functionalities to "simulate" experimental campaigns with a given lookup mechanism. This user guide briefly introduces how to use the methods available in our [simulation subpackage](baybe.simulation). 
For a wide variety of applications of this functionality, we refer to the corresponding [examples](../../examples/Backtesting/Backtesting). @@ -13,26 +13,86 @@ Thus, "simulation" means investigating what experimental trajectory we would hav 2. It can refer to the simulation of an *actual* DOE loop, i.e., recommending experiments and retrieving the corresponding measurements, where the loop closure is realized in the form of a callable (black-box) function that can be queried during the optimization to provide target values. Such a callable could for instance be a simple analytical function or a numerical solver of a set of differential equations that describe a physical system. -## The Lookup Functionality +## The Lookup Mechanism -In BayBE, the simulation submodule allows a wide range of use cases and can even be used for "oracle predictions". -This is enabled by the proper use of the `lookup` functionality, which allows to either use fixed data sets, analytical functions, and general callbacks for retrieving target function values. +BayBE's simulation package enables a wide range of use cases and can even be used for "oracle predictions". +This is made possible through the flexible use of `lookup` mechanisms, which act as the loop-closing element of an optimization loop. -All functions require a `lookup` which is used to close the loop and return target values for points in the search space. -It can be provided in the form of a dataframe or a `Callable`. +Lookups can be provided in a variety of ways, by using fixed data sets, analytical functions, or any other form of black-box callable. +In all cases, their role is the same: to retrieve target values for parameter configurations suggested by the recommendation engine. -```{note} -Technically, the `lookup` can also be `None`. This results in the simulation producing random results which is not discussed further. +### Using a `Callable` + +Using a `Callable` is the most general way to provide a lookup mechanism. 
+Any callable is a suitable lookup as long as it accepts a dataframe containing parameter configurations and returns the corresponding target values. +More specifically: +- The input is expected to be a dataframe whose column names contain the parameter names and whose rows represent valid parameter configurations. +- The returned output must be a dataframe whose column names contain the target names and whose rows represent valid target values. +- The indices of the input and output dataframes must match. + +An example might look like this: +```python +import pandas as pd + +from baybe.targets import NumericalTarget +from baybe.parameters import NumericalContinuousParameter +from baybe.searchspace import SearchSpace + + +searchspace = SearchSpace.from_product( + [ + NumericalContinuousParameter("p1", [0, 1]), + NumericalContinuousParameter("p2", [-1, 1]), + ] +) +objective = NumericalTarget("t1", "MAX").to_objective() + +def lookup(df: pd.DataFrame) -> pd.DataFrame: + """Map parameter configurations to target values.""" + return pd.DataFrame( + {"t1": df["p1"] ** 2, "t2": df[["p1", "p2"]].sum(axis=1)}, + index=df.index, + ) + +lookup(searchspace.continuous.sample_uniform(10)) ``` ### Using a Dataframe -When choosing a dataframe, it needs to contain parameter combinations and their target results. -To make sure that the backtest produces a realistic assessment of the performance, all possible parameter combinations should be measured and present in the dataframe. -However, this is an unrealistic assumption for most applications as it is typically not the case that all possible parameter combinations have been measured prior to the optimization. -As a consequence, it might well be the case that a provided dataframe contains the measurements of only some parameter configurations while a majority of combinations is not present. -For this case, BayBE offers different ways of handling such "missing" values. 
-This behavior is configured using the `impute_mode` keyword and provides the following possible choices: +When dealing with discrete search spaces, it is also possible to provide the lookup values in a tabular representation using a dataframe. +To be a valid lookup, the dataframe must have columns corresponding to all parameters and targets in the modeled domain. + +An example might look as follows: +```python +import pandas as pd + +from baybe.parameters import NumericalDiscreteParameter +from baybe.searchspace import SearchSpace +from baybe.targets import NumericalTarget + +searchspace = SearchSpace.from_product( + [ + NumericalDiscreteParameter("p1", [0, 1, 2, 3]), + NumericalDiscreteParameter("p2", [1, 10, 100, 1000]), + ] +) +objective = NumericalTarget("t", "MAX").to_objective() + +lookup = pd.DataFrame.from_records( + [ + {"p1": 0, "p2": 100, "t": 23}, + {"p1": 2, "p2": 10, "t": 5}, + {"p1": 3, "p2": 1000, "t": 56}, + ] +) +``` + +Ideally, all possible parameter combinations should be measured and represented in the data frame to ensure that a backtesting simulation produces a realistic performance assessment. +However, this is an unrealistic assumption for most applications because search spaces are oftentimes exceedingly large. +As a consequence, it may well be the case that a provided dataframe contains the measurements of only some parameter configurations while the majority of combinations is not present. + +For this situation, BayBE offers several ways to handle such "missing" targets. +The behavior is configured using the `impute_mode` keyword of the respective simulation function, which offers the following options for handling missing values: - ``"error"``: An error will be thrown. - ``"worst"``: Imputation uses the worst available value for each target. - ``"best"``: Imputation uses the best available value for each target. 
@@ -40,9 +100,12 @@ This behavior is configured using the `impute_mode` keyword and provides the fol - ``"random"``: A random row will be used as lookup. - ``"ignore"``: The search space is stripped before recommendations are made so that unmeasured experiments will not be recommended. -### Using a `Callable` +### Using `None` + +When testing code, it can sometimes be helpful to have an "arbitrary" lookup mechanism available without having to craft a custom one. +An example of when this is useful is when evaluating the actual lookup is too expensive and results in too long turnaround times (for instance, when the lookup is implemented by running complex code such as a computer simulation). +In these situations, using `None` as lookup can save valuable development time, which invokes the {func}`~baybe.utils.dataframe.add_fake_measurements` utility behind the scenes to generate random target values for any given domain. -The `Callable` needs to return the target values for any given parameter combination. The only requirement that BayBE imposes on using a `Callable` as a lookup mechanism is thus that it returns either a float or a tuple of floats and to accept an arbitrary number of floats as input. ## Simulating a Single Experiment @@ -67,7 +130,7 @@ results = simulate_experiment( ) ~~~ -This function returns a dataframe that contains the results. For details on the columns of this dataframe as well as the dataframes returned by the other functions discussed here, we refer to the documentation of the submodule [here](baybe.simulation). +This function returns a dataframe that contains the results. For details on the columns of this dataframe as well as the dataframes returned by the other functions discussed here, we refer to the documentation of the subpackage [here](baybe.simulation). 
## Simulating Multiple Scenarios From 9689e27dcca8e4ccf05b68e519cf92c32fe673d0 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Mon, 16 Dec 2024 15:48:09 +0100 Subject: [PATCH 21/32] Update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 75559196b..aba47d476 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `benchmarks` subpackage for defining and running performance tests – `Campaign.toggle_discrete_candidates` to dynamically in-/exclude discrete candidates - `filter_df` utility for filtering dataframe content +- `label_input_and_output_columns` decorator to create lookups from array-based + callables - `DiscreteConstraint.get_valid` to conveniently access valid candidates - Functionality for persisting benchmarking results on S3 from a manual pipeline run From fbe3d69c80e7b4aded475e6087380f1d8ad59daa Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Mon, 16 Dec 2024 15:50:32 +0100 Subject: [PATCH 22/32] Fix example section heading --- examples/Backtesting/custom_analytical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Backtesting/custom_analytical.py b/examples/Backtesting/custom_analytical.py index 2381a509b..2ae17ffd4 100644 --- a/examples/Backtesting/custom_analytical.py +++ b/examples/Backtesting/custom_analytical.py @@ -74,7 +74,7 @@ def blackbox(df: pd.DataFrame, /) -> pd.DataFrame: recommender=RandomRecommender(), ) -### Running the Optimization Loop +### Running the Simulation Loop # Next, we simulate both scenarios using the # {func}`~baybe.simulation.scenarios.simulate_scenarios` utility, From 10e344672b9bb4c5f9bf215649c25a9e42f3dbe6 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Mon, 16 Dec 2024 16:25:50 +0100 Subject: [PATCH 23/32] Drop botorch_wrapper module from test configuration --- .coveragerc | 1 - tests/test_imports.py | 1 - 2 files changed, 2 
deletions(-) diff --git a/.coveragerc b/.coveragerc index 40692fe48..0925b1cc1 100644 --- a/.coveragerc +++ b/.coveragerc @@ -2,5 +2,4 @@ omit = baybe/utils/plotting.py baybe/utils/random.py - baybe/utils/botorch_wrapper.py baybe/simulation/* \ No newline at end of file diff --git a/tests/test_imports.py b/tests/test_imports.py index 227410297..3e2980428 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -60,7 +60,6 @@ def test_imports(module: str): "torch": [ "baybe.acquisition.partial", "baybe.surrogates._adapter", - "baybe.utils.botorch_wrapper", "baybe.utils.torch", ], } From fe3a36d3446c80bdfa08a1fc96fb3f6191238649 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 19 Dec 2024 13:20:42 +0100 Subject: [PATCH 24/32] Use explicit value assignment for dataframe update Co-authored-by: Martin Fitzner <17951239+Scienfitz@users.noreply.github.com> --- baybe/simulation/lookup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/simulation/lookup.py b/baybe/simulation/lookup.py index 7bda503d4..96e99a436 100644 --- a/baybe/simulation/lookup.py +++ b/baybe/simulation/lookup.py @@ -82,7 +82,7 @@ def _look_up_targets_from_callable( ) -> None: """Look up target values by querying a callable.""" df_targets = lookup(queries) - queries[df_targets.columns] = df_targets + queries[df_targets.columns] = df_targets.values def _look_up_targets_from_dataframe( From 71c9f7422bc9f1ef221aac043aa20d6696f5a934 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 19 Dec 2024 13:35:55 +0100 Subject: [PATCH 25/32] Remove unnecessary target from user guide example --- docs/userguide/simulation.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/userguide/simulation.md b/docs/userguide/simulation.md index bd2df9cd5..f1293caa5 100644 --- a/docs/userguide/simulation.md +++ b/docs/userguide/simulation.md @@ -34,10 +34,9 @@ An example might look like this: ```python import pandas as pd -from baybe.targets import 
NumericalTarget from baybe.parameters import NumericalContinuousParameter from baybe.searchspace import SearchSpace - +from baybe.targets import NumericalTarget searchspace = SearchSpace.from_product( [ @@ -47,12 +46,11 @@ searchspace = SearchSpace.from_product( ) objective = NumericalTarget("t1", "MAX").to_objective() + def lookup(df: pd.DataFrame) -> pd.DataFrame: """Map parameter configurations to target values.""" - return pd.DataFrame( - {"t1": df["p1"] ** 2, "t2": df[["p1", "p2"]].sum(axis=1)}, - index=df.index, - ) + return pd.DataFrame({"t1": df["p1"] ** 2}, index=df.index) + lookup(searchspace.continuous.sample_uniform(10)) ``` From b64b0ce5584c42f5f8494736f19c7c21e5c68840 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 19 Dec 2024 14:19:08 +0100 Subject: [PATCH 26/32] Apply minor text improvements --- docs/userguide/simulation.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/userguide/simulation.md b/docs/userguide/simulation.md index f1293caa5..72f2e5e29 100644 --- a/docs/userguide/simulation.md +++ b/docs/userguide/simulation.md @@ -16,7 +16,7 @@ Thus, "simulation" means investigating what experimental trajectory we would hav ## The Lookup Mechanism BayBE's simulation package enables a wide range of use cases and can even be used for "oracle predictions". -This is made possible through the flexible use of `lookup` mechanisms, which act as the loop-closing element of an optimization loop. +This is made possible through the flexible use of lookup mechanisms, which act as the loop-closing element of an optimization loop. Lookups can be provided in a variety of ways, by using fixed data sets, analytical functions, or any other form of black-box callable. In all cases, their role is the same: to retrieve target values for parameter configurations suggested by the recommendation engine. 
@@ -24,7 +24,7 @@ In all cases, their role is the same: to retrieve target values for parameter co ### Using a `Callable` Using a `Callable` is the most general way to provide a lookup mechanism. -Any callable is a suitable lookup as long as it accepts a dataframe containing parameter configurations and returns the corresponding target values. +Any `Callable` is a suitable lookup as long as it accepts a dataframe containing parameter configurations and returns the corresponding target values. More specifically: - The input is expected to be a dataframe whose column names contain the parameter names and whose rows represent valid parameter configurations. - The returned output must be a dataframe whose column names contain the target names and whose rows represent valid target values. @@ -85,9 +85,9 @@ lookup = pd.DataFrame.from_records( ) ``` -Ideally, all possible parameter combinations should be measured and represented in the data frame to ensure that a backtesting simulation produces a realistic performance assessment. +Ideally, all possible parameter combinations should be measured and represented in the dataframe to ensure that a backtesting simulation produces a realistic performance assessment. However, this is an unrealistic assumption for most applications because search spaces are oftentimes exceedingly large. -As a consequence, it may well be the case that a provided dataframe contains the measurements of only some parameter configurations while the majority of combinations is not present. +As a consequence, it may well be the case that a provided dataframe contains the measurements of only some parameter configurations while the majority of combinations is not present (like in the example above). For this situation, BayBE offers several ways to handle such "missing" targets. 
The behavior is configured using the `impute_mode` keyword of the respective simulation function, which offers the following options for handling missing values: From b1d837337e332037c7839e11f636fcf9a59b9b19 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 19 Dec 2024 15:14:37 +0100 Subject: [PATCH 27/32] Reference impute mode options via keyword link in admonition --- .lockfiles/py310-dev.lock | 4 ++++ docs/conf.py | 1 + docs/userguide/simulation.md | 17 +++++++---------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.lockfiles/py310-dev.lock b/.lockfiles/py310-dev.lock index 46729f7c0..39df21abe 100644 --- a/.lockfiles/py310-dev.lock +++ b/.lockfiles/py310-dev.lock @@ -135,6 +135,7 @@ docutils==0.21.2 # myst-parser # pybtex-docutils # sphinx + # sphinx-paramlinks # sphinxcontrib-bibtex e3fp==1.2.5 # via scikit-fingerprints @@ -856,6 +857,7 @@ sphinx==8.1.3 # sphinx-autodoc-typehints # sphinx-basic-ng # sphinx-copybutton + # sphinx-paramlinks # sphinxcontrib-bibtex sphinx-autodoc-typehints==2.5.0 # via baybe (pyproject.toml) @@ -863,6 +865,8 @@ sphinx-basic-ng==1.0.0b2 # via furo sphinx-copybutton==0.5.2 # via baybe (pyproject.toml) +sphinx-paramlinks==0.6.0 + # via baybe (pyproject.toml) sphinxcontrib-applehelp==1.0.8 # via sphinx sphinxcontrib-bibtex==2.6.2 diff --git a/docs/conf.py b/docs/conf.py index 403322ffe..c778a48a4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -82,6 +82,7 @@ "sphinx_autodoc_typehints", # Proper typehints "sphinx_copybutton", # Copy button for code blocks "sphinxcontrib.bibtex", # Bibtex support + "sphinx_paramlinks", # Links to arguments of callables ] bibtex_bibfiles = ["references.bib"] myst_enable_extensions = ["dollarmath"] # Enables Latex-like math in markdown files diff --git a/docs/userguide/simulation.md b/docs/userguide/simulation.md index 72f2e5e29..dd2655b16 100644 --- a/docs/userguide/simulation.md +++ b/docs/userguide/simulation.md @@ -85,18 +85,15 @@ lookup = pd.DataFrame.from_records( ) ``` 
-Ideally, all possible parameter combinations should be measured and represented in the dataframe to ensure that a backtesting simulation produces a realistic performance assessment. +```{admonition} Missing Lookup Values +:class: tip +Ideally, all possible parameter combinations should be measured and represented in the dataframe to ensure that a backtesting simulation produces a realistic assessment of performance. However, this is an unrealistic assumption for most applications because search spaces are oftentimes exceedingly large. As a consequence, it may well be the case that a provided dataframe contains the measurements of only some parameter configurations while the majority of combinations is not present (like in the example above). - -For this situation, BayBE offers several ways to handle such "missing" targets. -The behavior is configured using the `impute_mode` keyword of the respective simulation function, which offers the following options for handling missing values: -- ``"error"``: An error will be thrown. -- ``"worst"``: Imputation uses the worst available value for each target. -- ``"best"``: Imputation uses the best available value for each target. -- ``"mean"``: Imputation uses the mean value for each target. -- ``"random"``: A random row will be used as lookup. -- ``"ignore"``: The search space is stripped before recommendations are made so that unmeasured experiments will not be recommended. +To address this issue, BayBE provides various methods for managing these “missing” targets, +which can be configured using the {paramref}`~baybe.simulation.lookup.look_up_targets.impute_mode` +keyword of the respective simulation function. 
+``` ### Using `None` From f32207da87107f88574ac88a0f0d24540081b652 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 19 Dec 2024 15:27:38 +0100 Subject: [PATCH 28/32] Rewrite blackbox function docstring --- examples/Backtesting/custom_analytical.py | 2 +- ruff.toml | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/Backtesting/custom_analytical.py b/examples/Backtesting/custom_analytical.py index 2ae17ffd4..e3ee4de0d 100644 --- a/examples/Backtesting/custom_analytical.py +++ b/examples/Backtesting/custom_analytical.py @@ -49,7 +49,7 @@ def blackbox(df: pd.DataFrame, /) -> pd.DataFrame: - """Calculate the sum of squares.""" + """A callable whose internal logic is unknown to the algorithm.""" return (df[[p.name for p in parameters]] ** 2).sum(axis=1).to_frame(target.name) diff --git a/ruff.toml b/ruff.toml index 230d8cb7a..c93987fc3 100644 --- a/ruff.toml +++ b/ruff.toml @@ -7,7 +7,9 @@ line-length = 88 # Maximum line length # Error regarding too long lines "E501", # Missing module docstrings - "D100" + "D100", + # In the documentation, imperative mood can be a bit too restrictive + "D401", ] "baybe/utils/__init__.py" = ["F401","F403"] "baybe/strategies/__init__.py" = ["F401"] From 6b201d4ac0419b2e6afba1385657cfc451784659 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Thu, 19 Dec 2024 15:30:02 +0100 Subject: [PATCH 29/32] Rename custom_analytical.py to custom_blackbox.py --- .../Backtesting/{custom_analytical.py => custom_blackbox.py} | 2 +- .../{custom_analytical_dark.svg => custom_blackbox_dark.svg} | 0 .../{custom_analytical_light.svg => custom_blackbox_light.svg} | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename examples/Backtesting/{custom_analytical.py => custom_blackbox.py} (98%) rename examples/Backtesting/{custom_analytical_dark.svg => custom_blackbox_dark.svg} (100%) rename examples/Backtesting/{custom_analytical_light.svg => custom_blackbox_light.svg} (100%) diff --git a/examples/Backtesting/custom_analytical.py 
b/examples/Backtesting/custom_blackbox.py similarity index 98% rename from examples/Backtesting/custom_analytical.py rename to examples/Backtesting/custom_blackbox.py index e3ee4de0d..eef27b59f 100644 --- a/examples/Backtesting/custom_analytical.py +++ b/examples/Backtesting/custom_blackbox.py @@ -104,4 +104,4 @@ def blackbox(df: pd.DataFrame, /) -> pd.DataFrame: y="Target_CumBest", hue="Scenario", ) -create_example_plots(ax=ax, base_name="custom_analytical") +create_example_plots(ax=ax, base_name="custom_blackbox") diff --git a/examples/Backtesting/custom_analytical_dark.svg b/examples/Backtesting/custom_blackbox_dark.svg similarity index 100% rename from examples/Backtesting/custom_analytical_dark.svg rename to examples/Backtesting/custom_blackbox_dark.svg diff --git a/examples/Backtesting/custom_analytical_light.svg b/examples/Backtesting/custom_blackbox_light.svg similarity index 100% rename from examples/Backtesting/custom_analytical_light.svg rename to examples/Backtesting/custom_blackbox_light.svg From 798a59ada55a9419a070912de4d98185a1e8103a Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 20 Dec 2024 09:31:41 +0100 Subject: [PATCH 30/32] Rename decorator utility --- CHANGELOG.md | 3 +-- baybe/utils/dataframe.py | 10 +++++++--- examples/Constraints_Continuous/hybrid_space.py | 4 ++-- examples/Constraints_Continuous/linear_constraints.py | 4 ++-- examples/Custom_Hooks/probability_of_improvement.py | 4 ++-- examples/Searchspaces/discrete_space.py | 4 ++-- examples/Searchspaces/hybrid_space.py | 4 ++-- examples/Transfer_Learning/backtesting.py | 4 ++-- examples/Transfer_Learning/basic_transfer_learning.py | 4 ++-- 9 files changed, 22 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aba47d476..5a04d1fb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,8 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `benchmarks` subpackage for defining and running performance tests – 
`Campaign.toggle_discrete_candidates` to dynamically in-/exclude discrete candidates - `filter_df` utility for filtering dataframe content -- `label_input_and_output_columns` decorator to create lookups from array-based - callables +- `arrays_to_dataframes` decorator to create lookups from array-based callables - `DiscreteConstraint.get_valid` to conveniently access valid candidates - Functionality for persisting benchmarking results on S3 from a manual pipeline run diff --git a/baybe/utils/dataframe.py b/baybe/utils/dataframe.py index 3d3be7cfa..ba19578da 100644 --- a/baybe/utils/dataframe.py +++ b/baybe/utils/dataframe.py @@ -683,13 +683,17 @@ def filter_df( return out -def label_input_and_output_columns( - input_labels: Sequence[str], output_labels: Sequence[str], use_torch: bool = False +def arrays_to_dataframes( + input_labels: Sequence[str], + output_labels: Sequence[str], + /, + use_torch: bool = False, ) -> Callable[ [Callable[[_ArrayLike], _ArrayLike]], Callable[[pd.DataFrame], pd.DataFrame] ]: - """Create a decorator for labeling the inputs and outputs of array-based callables. + """Make a decorator for labeling the input/output columns of array-based callables. + Useful for creating parameter-to-target lookups from array-based logic. The decorator transforms a callable designed to work with unlabelled arrays such that it can operate with dataframes instead. The original callable is expected to accept and return two-dimensional arrays. 
When decorated, the callable accepts and diff --git a/examples/Constraints_Continuous/hybrid_space.py b/examples/Constraints_Continuous/hybrid_space.py index f88d873aa..98fa2a086 100644 --- a/examples/Constraints_Continuous/hybrid_space.py +++ b/examples/Constraints_Continuous/hybrid_space.py @@ -25,7 +25,7 @@ from baybe.parameters import NumericalContinuousParameter, NumericalDiscreteParameter from baybe.searchspace import SearchSpace from baybe.targets import NumericalTarget -from baybe.utils.dataframe import label_input_and_output_columns +from baybe.utils.dataframe import arrays_to_dataframes ### Defining the test function @@ -93,7 +93,7 @@ ### Wrap the test function as a dataframe-based lookup callable -lookup = label_input_and_output_columns( +lookup = arrays_to_dataframes( [p.name for p in parameters], [target.name], use_torch=True )(TestFunction) diff --git a/examples/Constraints_Continuous/linear_constraints.py b/examples/Constraints_Continuous/linear_constraints.py index 928b0fb93..7eba01d56 100644 --- a/examples/Constraints_Continuous/linear_constraints.py +++ b/examples/Constraints_Continuous/linear_constraints.py @@ -22,7 +22,7 @@ from baybe.parameters import NumericalContinuousParameter from baybe.searchspace import SearchSpace from baybe.targets import NumericalTarget -from baybe.utils.dataframe import label_input_and_output_columns +from baybe.utils.dataframe import arrays_to_dataframes ### Defining the test function @@ -79,7 +79,7 @@ ### Wrap the test function as a dataframe-based lookup callable -lookup = label_input_and_output_columns( +lookup = arrays_to_dataframes( [p.name for p in parameters], [target.name], use_torch=True )(TestFunction) diff --git a/examples/Custom_Hooks/probability_of_improvement.py b/examples/Custom_Hooks/probability_of_improvement.py index 6cdd96687..55a0659a6 100644 --- a/examples/Custom_Hooks/probability_of_improvement.py +++ b/examples/Custom_Hooks/probability_of_improvement.py @@ -37,7 +37,7 @@ from baybe.surrogates 
import GaussianProcessSurrogate from baybe.targets import NumericalTarget from baybe.utils.basic import register_hooks -from baybe.utils.dataframe import label_input_and_output_columns, to_tensor +from baybe.utils.dataframe import arrays_to_dataframes, to_tensor from baybe.utils.plotting import create_example_plots from baybe.utils.random import set_random_seed @@ -141,7 +141,7 @@ def extract_pi( # Now, we perform a couple of experimental iterations with the active hook: -lookup = label_input_and_output_columns( +lookup = arrays_to_dataframes( [p.name for p in discrete_params], [target.name], use_torch=True )(test_function) diff --git a/examples/Searchspaces/discrete_space.py b/examples/Searchspaces/discrete_space.py index 9f87a783d..c0a4f2ece 100644 --- a/examples/Searchspaces/discrete_space.py +++ b/examples/Searchspaces/discrete_space.py @@ -13,7 +13,7 @@ from baybe.parameters import NumericalDiscreteParameter from baybe.searchspace import SearchSpace from baybe.targets import NumericalTarget -from baybe.utils.dataframe import label_input_and_output_columns +from baybe.utils.dataframe import arrays_to_dataframes ### Defining the test function @@ -89,7 +89,7 @@ # Evaluate the test function. 
-lookup = label_input_and_output_columns( +lookup = arrays_to_dataframes( [p.name for p in parameters], [target.name], use_torch=True )(TestFunction) diff --git a/examples/Searchspaces/hybrid_space.py b/examples/Searchspaces/hybrid_space.py index d2556413d..bceb6abaf 100644 --- a/examples/Searchspaces/hybrid_space.py +++ b/examples/Searchspaces/hybrid_space.py @@ -18,7 +18,7 @@ from baybe.recommenders import NaiveHybridSpaceRecommender, TwoPhaseMetaRecommender from baybe.searchspace import SearchSpace from baybe.targets import NumericalTarget -from baybe.utils.dataframe import label_input_and_output_columns +from baybe.utils.dataframe import arrays_to_dataframes ### Defining the test function and the hybrid dimensions @@ -104,7 +104,7 @@ ### Wrap the test function as a dataframe-based lookup callable -lookup = label_input_and_output_columns( +lookup = arrays_to_dataframes( searchspace.parameter_names, [target.name], use_torch=True )(TestFunction) diff --git a/examples/Transfer_Learning/backtesting.py b/examples/Transfer_Learning/backtesting.py index b44c6e448..3d4bca784 100644 --- a/examples/Transfer_Learning/backtesting.py +++ b/examples/Transfer_Learning/backtesting.py @@ -23,7 +23,7 @@ from baybe.searchspace import SearchSpace from baybe.simulation import simulate_scenarios, simulate_transfer_learning from baybe.targets import NumericalTarget -from baybe.utils.dataframe import label_input_and_output_columns +from baybe.utils.dataframe import arrays_to_dataframes from baybe.utils.plotting import create_example_plots ### Settings @@ -92,7 +92,7 @@ # and vice versa. The used model is of course not aware of this relationship but # needs to infer it from the data gathered during the optimization process. 
-wrapper = label_input_and_output_columns( +wrapper = arrays_to_dataframes( [p.name for p in discrete_params], [target.name], use_torch=True ) diff --git a/examples/Transfer_Learning/basic_transfer_learning.py b/examples/Transfer_Learning/basic_transfer_learning.py index 7883659ae..ad095d95f 100644 --- a/examples/Transfer_Learning/basic_transfer_learning.py +++ b/examples/Transfer_Learning/basic_transfer_learning.py @@ -21,7 +21,7 @@ from baybe.searchspace import SearchSpace from baybe.simulation import simulate_scenarios from baybe.targets import NumericalTarget -from baybe.utils.dataframe import label_input_and_output_columns +from baybe.utils.dataframe import arrays_to_dataframes from baybe.utils.plotting import create_example_plots ### Settings @@ -96,7 +96,7 @@ # noise. The used model is of course not aware of this relationship but needs to infer # it from the data gathered during the optimization process. -wrapper = label_input_and_output_columns( +wrapper = arrays_to_dataframes( [p.name for p in discrete_params], [target.name], use_torch=True ) From 3d1c0ebb24a8f9787db6c57fddad32982b613e14 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 20 Dec 2024 10:11:11 +0100 Subject: [PATCH 31/32] Add admonition mentioning decorator utility --- docs/userguide/simulation.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs/userguide/simulation.md b/docs/userguide/simulation.md index dd2655b16..8581bd364 100644 --- a/docs/userguide/simulation.md +++ b/docs/userguide/simulation.md @@ -55,6 +55,30 @@ def lookup(df: pd.DataFrame) -> pd.DataFrame: lookup(searchspace.continuous.sample_uniform(10)) ``` +````{admonition} Array-Based Callables +:class: tip +If you already have a lookup callable available in an array-based format (for instance, +if your lookup values are generated using third-party code that works with array inputs +and outputs), you can effortlessly convert this callable into the required +dataframe-based format by applying 
our +{func}`~baybe.utils.dataframe.arrays_to_dataframes` decorator. + +For example, the above lookup can be equivalently created as follows: +```python +import numpy as np + +from baybe.utils.dataframe import arrays_to_dataframes + + +@arrays_to_dataframes(["p1"], ["t1"]) +def lookup(array: np.ndarray) -> np.ndarray: + """The same lookup function in array logic.""" + return array**2 +``` + +```` + + ### Using a Dataframe When dealing with discrete search spaces, it is also possible to provide the lookup values in a tabular representation using a dataframe. From a6cc05a72d8d6a8169e1c84d94233218b9c02ed5 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 20 Dec 2024 10:30:56 +0100 Subject: [PATCH 32/32] Refine filter_df logic * Rename filter argument to to_keep * Make target dataframe positional-only * Revise docstring --- baybe/utils/dataframe.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/baybe/utils/dataframe.py b/baybe/utils/dataframe.py index ba19578da..61cc4f051 100644 --- a/baybe/utils/dataframe.py +++ b/baybe/utils/dataframe.py @@ -607,7 +607,7 @@ def get_transform_objects( def filter_df( - df: pd.DataFrame, filter: pd.DataFrame, complement: bool = False + df: pd.DataFrame, /, to_keep: pd.DataFrame, complement: bool = False ) -> pd.DataFrame: """Filter a dataframe based on a second dataframe defining filtering conditions. @@ -616,9 +616,11 @@ def filter_df( Args: df: The dataframe to be filtered. - filter: The dataframe defining the filtering conditions. + to_keep: The dataframe defining the filtering conditions. By default + (see ``complement`` argument), it defines the rows to be kept in the sense + of an inner join. complement: If ``False``, the filter dataframe determines the rows to be kept - (i.e. selection via regular join). If ``True``, the filtering mechanism is + (i.e. selection via inner join). If ``True``, the filtering mechanism is inverted so that the complement set of rows is kept (i.e. 
selection via anti-join). @@ -661,7 +663,7 @@ def filter_df( """ # Handle special case of empty filter - if filter.empty: + if to_keep.empty: return df if complement else pd.DataFrame(columns=df.columns) # Remember original index name @@ -669,7 +671,7 @@ def filter_df( # Identify rows to be dropped out = pd.merge( - df.reset_index(names="_df_index"), filter, how="left", indicator=True + df.reset_index(names="_df_index"), to_keep, how="left", indicator=True ).set_index("_df_index") to_drop = out["_merge"] == ("both" if complement else "left_only")