Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

callbacks: Improve performance of _store_outcomes by ~35x #232

Merged
merged 9 commits into from
Apr 9, 2023
78 changes: 30 additions & 48 deletions ema_workbench/em_framework/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,41 +195,32 @@ def __init__(
reporting_frequency,
log_progress,
)

self.cases = None
self.cases = np.empty((nr_experiments, len(uncertainties) + len(levers)), dtype=object)
self.uncertainty_and_lever_labels = [(entry.name, "") for entry in uncertainties + levers]
self.uncertainties = [u.name for u in uncertainties]
self.levers = [l.name for l in levers]
self.results = {}

# determine data types of parameters
columns = []
dtypes = []

for parameter in self.parameters:
name = parameter.name
dtype = "float"

if isinstance(parameter, BooleanParameter):
dtype = "bool"
elif isinstance(parameter, CategoricalParameter):
dtype = "object"
elif isinstance(parameter, IntegerParameter):
dtype = "int"
columns.append(name)
dtypes.append(dtype)

for name in ["scenario", "policy", "model"]:
columns.append(name)
dtypes.append("object")

self.columns = columns
dtypes.append((parameter.name, dtype))

dtypes.extend(
[
("scenario", "object"),
("policy", "object"),
("model", "object"),
]
)
self.dtypes = dtypes

index = np.arange(nr_experiments)
column_dict = {
name: pd.Series(dtype=dtype, index=index) for name, dtype in zip(columns, dtypes)
}
df = pd.concat(column_dict, axis=1).copy()

self.cases = df
self.cases = np.empty(nr_experiments, dtype=dtypes)

for outcome in self.outcomes:
shape = outcome.shape
Expand All @@ -242,43 +233,36 @@ def _store_case(self, experiment):
policy = experiment.policy
index = experiment.experiment_id

self.cases.at[index, "scenario"] = scenario.name
self.cases.at[index, "policy"] = policy.name
self.cases.at[index, "model"] = experiment.model_name

for k, v in scenario.items():
self.cases.at[index, k] = v

for k, v in policy.items():
self.cases.at[index, k] = v
self.cases[index] = (
tuple([scenario[u] for u in self.uncertainties])
+ tuple([policy[l] for l in self.levers])
+ (scenario.name, policy.name, experiment.model_name)
)

def _store_outcomes(self, case_id, outcomes):
for outcome in self.outcomes:
outcome = outcome.name
_logger.debug(f"storing {outcome}")
outcome_name = outcome.name

try:
outcome_res = outcomes[outcome]
outcome_res = outcomes[outcome_name]
except KeyError:
message = f"{outcome} not specified as outcome in " f"model(s)"
message = f"{outcome_name} not specified as outcome in model(s)"
_logger.debug(message)
else:
try:
self.results[outcome][case_id,] = outcome_res
self.results[outcome_name][case_id,] = outcome_res
except KeyError:
data = np.asarray(outcome_res)

shape = data.shape

if len(shape) > 2:
message = self.shape_error_msg.format(len(shape))
raise ema_exceptions.EMAError(message)

shape = list(shape)
shape.insert(0, self.nr_experiments)

self.results[outcome] = self._setup_outcomes_array(shape, data.dtype)
self.results[outcome][case_id,] = outcome_res
self.results[outcome_name] = self._setup_outcomes_array(shape, data.dtype)
self.results[outcome_name][case_id,] = outcome_res

def __call__(self, experiment, outcomes):
"""
Expand All @@ -293,11 +277,7 @@ def __call__(self, experiment, outcomes):

"""
super().__call__(experiment, outcomes)

# store the case
self._store_case(experiment)

# store outcomes
self._store_outcomes(experiment.experiment_id, outcomes)

def get_results(self):
Expand All @@ -309,18 +289,20 @@ def get_results(self):
_logger.warning("some experiments have failed, returning masked result arrays")
results[k] = v

cases = pd.DataFrame.from_records(self.cases)

# we want to ensure the dtypes for the columns in the experiments dataframe match
# the type of uncertainty. The exception is needed in case there are missing values (i.e. nans).
# nans can only ever be a float.
for name, dtype in zip(self.columns, self.dtypes):
for name, dtype in self.dtypes:
try:
if dtype == "object":
dtype = "category"
self.cases[name] = self.cases[name].astype(dtype)
cases[name] = cases[name].astype(dtype)
except Exception:
pass

return self.cases, results
return cases, results

def _setup_outcomes_array(self, shape, dtype):
array = np.ma.empty(shape, dtype=dtype)
Expand Down
18 changes: 11 additions & 7 deletions test/test_em_framework/test_callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,13 @@ def test_store_results(mocker):
# case 5 assert raises KeyError
callback = DefaultCallback(uncs, [], outcomes, nr_experiments=nr_experiments)
model_outcomes = {"some_other_name": np.random.rand(2, 2, 2)}
mock = mocker.patch("ema_workbench.em_framework.callbacks._logger.debug")
mock = mocker.patch(
"ema_workbench.em_framework.callbacks._logger.debug",
autospec=True,
side_effect=lambda *args, **kwargs: print(args, kwargs),
)
callback._store_outcomes(1, model_outcomes)
assert mock.call_count == 2
assert mock.call_count == 1


def test_init():
Expand All @@ -94,9 +98,9 @@ def test_init():
assert callback.reporting_interval == 100
# self.assertEqual(callback.outcomes, outcomes)

names = callback.cases.columns.values.tolist()
names = [name for name, _ in callback.uncertainty_and_lever_labels]
names = set(names)
assert names == {"a", "b", "policy", "model", "scenario"}
assert names == {"a", "b"}

assert "scalar" not in callback.results
assert "timeseries" not in callback.results
Expand All @@ -121,9 +125,9 @@ def test_init():
assert callback.reporting_interval == 250
# self.assertEqual(callback.outcomes, [o.name for o in outcomes])

names = callback.cases.columns.values.tolist()
names = [name for name, _ in callback.uncertainty_and_lever_labels]
names = set(names)
assert names == {"a", "b", "c", "policy", "model", "scenario"}
assert names == {"a", "b", "c"}

assert "scalar" not in callback.results
assert "timeseries" not in callback.results
Expand Down Expand Up @@ -209,7 +213,7 @@ def test_store_cases():

names = experiments.columns.values.tolist()
for name in names:
assert experiments[name][0] == design[name]
assert experiments[name][0] == design.get(name), f"failed for name {name}"


def test_get_results(mocker):
Expand Down