Skip to content

Commit

Permalink
callbacks: Improve performance of _store_outcomes by ~25x (#232)
Browse files Browse the repository at this point in the history
* callbacks: Improve performance of _store_outcomes by ~25x

In a performance profile of the optimized lake model, I noticed that the _store_case function took up a large portion of the runtime (46.4%, or 9535 ms, over 5000 runs).

This commit optimizes the DefaultCallback class by:
- Replacing Pandas DataFrames with NumPy structured arrays for faster data access and manipulation.
- Using NumPy functions and data structures where possible.
- Reducing unnecessary function calls.

According to profiles, it speeds up `_store_outcomes` by about 25x. When running a very simple model, the whole workbench is now over 2.5x as fast.

**Old**
- Profile: Total time 11,172 ms (48.8%), own time 559 ms (2.4%)
- Ran about 15,000 iterations of the [simple Python model](https://emaworkbench.readthedocs.io/en/latest/basic_tutorial.html#a-simple-model-in-python) per second.

**New**
- Profile: Total time 424 ms (3.7%), own time 90 ms (0.8%).
- Now runs about 40,000 iterations of the [simple Python model](https://emaworkbench.readthedocs.io/en/latest/basic_tutorial.html#a-simple-model-in-python) per second.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* callbacks: Return a DataFrame instead of a NumPy array

* Return correct DataFrame and update test set

Return a correctly indexed DataFrame and make some small adjustments to the test set. Please review carefully.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* callback: Replace NaN with None

* callbacks: Update _store_case to ensure order of uncertainties and levers

Co-Authored-By: Jan Kwakkel <j.h.kwakkel@tudelft.nl>

* Update callbacks.py

make dtypes an attribute

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jan Kwakkel <j.h.kwakkel@tudelft.nl>
  • Loading branch information
3 people authored Apr 9, 2023
1 parent 081b41f commit a608a2e
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 55 deletions.
78 changes: 30 additions & 48 deletions ema_workbench/em_framework/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,41 +195,32 @@ def __init__(
reporting_frequency,
log_progress,
)

self.cases = None
self.cases = np.empty((nr_experiments, len(uncertainties) + len(levers)), dtype=object)
self.uncertainty_and_lever_labels = [(entry.name, "") for entry in uncertainties + levers]
self.uncertainties = [u.name for u in uncertainties]
self.levers = [l.name for l in levers]
self.results = {}

# determine data types of parameters
columns = []
dtypes = []

for parameter in self.parameters:
name = parameter.name
dtype = "float"

if isinstance(parameter, BooleanParameter):
dtype = "bool"
elif isinstance(parameter, CategoricalParameter):
dtype = "object"
elif isinstance(parameter, IntegerParameter):
dtype = "int"
columns.append(name)
dtypes.append(dtype)

for name in ["scenario", "policy", "model"]:
columns.append(name)
dtypes.append("object")

self.columns = columns
dtypes.append((parameter.name, dtype))

dtypes.extend(
[
("scenario", "object"),
("policy", "object"),
("model", "object"),
]
)
self.dtypes = dtypes

index = np.arange(nr_experiments)
column_dict = {
name: pd.Series(dtype=dtype, index=index) for name, dtype in zip(columns, dtypes)
}
df = pd.concat(column_dict, axis=1).copy()

self.cases = df
self.cases = np.empty(nr_experiments, dtype=dtypes)

for outcome in self.outcomes:
shape = outcome.shape
Expand All @@ -242,43 +233,36 @@ def _store_case(self, experiment):
policy = experiment.policy
index = experiment.experiment_id

self.cases.at[index, "scenario"] = scenario.name
self.cases.at[index, "policy"] = policy.name
self.cases.at[index, "model"] = experiment.model_name

for k, v in scenario.items():
self.cases.at[index, k] = v

for k, v in policy.items():
self.cases.at[index, k] = v
self.cases[index] = (
tuple([scenario[u] for u in self.uncertainties])
+ tuple([policy[l] for l in self.levers])
+ (scenario.name, policy.name, experiment.model_name)
)

def _store_outcomes(self, case_id, outcomes):
for outcome in self.outcomes:
outcome = outcome.name
_logger.debug(f"storing {outcome}")
outcome_name = outcome.name

try:
outcome_res = outcomes[outcome]
outcome_res = outcomes[outcome_name]
except KeyError:
message = f"{outcome} not specified as outcome in " f"model(s)"
message = f"{outcome_name} not specified as outcome in model(s)"
_logger.debug(message)
else:
try:
self.results[outcome][case_id,] = outcome_res
self.results[outcome_name][case_id,] = outcome_res
except KeyError:
data = np.asarray(outcome_res)

shape = data.shape

if len(shape) > 2:
message = self.shape_error_msg.format(len(shape))
raise ema_exceptions.EMAError(message)

shape = list(shape)
shape.insert(0, self.nr_experiments)

self.results[outcome] = self._setup_outcomes_array(shape, data.dtype)
self.results[outcome][case_id,] = outcome_res
self.results[outcome_name] = self._setup_outcomes_array(shape, data.dtype)
self.results[outcome_name][case_id,] = outcome_res

def __call__(self, experiment, outcomes):
"""
Expand All @@ -293,11 +277,7 @@ def __call__(self, experiment, outcomes):
"""
super().__call__(experiment, outcomes)

# store the case
self._store_case(experiment)

# store outcomes
self._store_outcomes(experiment.experiment_id, outcomes)

def get_results(self):
Expand All @@ -309,18 +289,20 @@ def get_results(self):
_logger.warning("some experiments have failed, returning masked result arrays")
results[k] = v

cases = pd.DataFrame.from_records(self.cases)

# we want to ensure the dtypes for the columns in the experiments dataframe match
# the type of uncertainty. The exception is needed in case there are missing values (i.e. nans).
# nans can only ever be a float.
for name, dtype in zip(self.columns, self.dtypes):
for name, dtype in self.dtypes:
try:
if dtype == "object":
dtype = "category"
self.cases[name] = self.cases[name].astype(dtype)
cases[name] = cases[name].astype(dtype)
except Exception:
pass

return self.cases, results
return cases, results

def _setup_outcomes_array(self, shape, dtype):
array = np.ma.empty(shape, dtype=dtype)
Expand Down
18 changes: 11 additions & 7 deletions test/test_em_framework/test_callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,13 @@ def test_store_results(mocker):
# case 5 assert raises KeyError
callback = DefaultCallback(uncs, [], outcomes, nr_experiments=nr_experiments)
model_outcomes = {"some_other_name": np.random.rand(2, 2, 2)}
mock = mocker.patch("ema_workbench.em_framework.callbacks._logger.debug")
mock = mocker.patch(
"ema_workbench.em_framework.callbacks._logger.debug",
autospec=True,
side_effect=lambda *args, **kwargs: print(args, kwargs),
)
callback._store_outcomes(1, model_outcomes)
assert mock.call_count == 2
assert mock.call_count == 1


def test_init():
Expand All @@ -94,9 +98,9 @@ def test_init():
assert callback.reporting_interval == 100
# self.assertEqual(callback.outcomes, outcomes)

names = callback.cases.columns.values.tolist()
names = [name for name, _ in callback.uncertainty_and_lever_labels]
names = set(names)
assert names == {"a", "b", "policy", "model", "scenario"}
assert names == {"a", "b"}

assert "scalar" not in callback.results
assert "timeseries" not in callback.results
Expand All @@ -121,9 +125,9 @@ def test_init():
assert callback.reporting_interval == 250
# self.assertEqual(callback.outcomes, [o.name for o in outcomes])

names = callback.cases.columns.values.tolist()
names = [name for name, _ in callback.uncertainty_and_lever_labels]
names = set(names)
assert names == {"a", "b", "c", "policy", "model", "scenario"}
assert names == {"a", "b", "c"}

assert "scalar" not in callback.results
assert "timeseries" not in callback.results
Expand Down Expand Up @@ -209,7 +213,7 @@ def test_store_cases():

names = experiments.columns.values.tolist()
for name in names:
assert experiments[name][0] == design[name]
assert experiments[name][0] == design.get(name), f"failed for name {name}"


def test_get_results(mocker):
Expand Down

0 comments on commit a608a2e

Please sign in to comment.