Commit 9b227e4

Merge branch 'deephyper' of https://github.com/fmohr/lcdb into deephyper

tomviering committed Oct 28, 2024
2 parents 2bc1dc9 + 4f8c4f3
Showing 10 changed files with 954 additions and 2,344 deletions.
@@ -16,21 +16,22 @@
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--n_trees', type=int, default=16)
-    parser.add_argument('--openml_ids', type=int, nargs='+', default=None)
+    parser.add_argument('--openml_ids', type=int, nargs='+', default=[3, 6])
     parser.add_argument('--workflow_name', type=str, default="lcdb.workflow.sklearn.LibLinearWorkflow")
     parser.add_argument('--openml_taskid_name', type=str, default="m:openmlid")
     parser.add_argument('--output_directory', type=str, default=os.path.expanduser('~/experiments/lcdb'))
     parser.add_argument('--output_filetype', type=str, choices=['pdf', 'png'], default='png')
     parser.add_argument('--max_load', type=int, default=None)
+    parser.add_argument('--anchor_values', type=int, nargs='+', default=[128, 512, 2048, -1])
     return parser.parse_args()
 
 
 def numeric_encode(df, config_space):
     # https://automl.github.io/ConfigSpace/latest/api/ConfigSpace/configuration_space/
-    result = np.zeros((len(df), len(config_space.get_hyperparameters())), dtype=float)
+    result = np.zeros((len(df), len(config_space.values())), dtype=float)
 
     for hyperparameter_name, hyperparameter in config_space.items():
-        index = config_space.get_idx_by_hyperparameter_name(hyperparameter_name)
+        index = config_space.index_of[hyperparameter_name]
         if isinstance(hyperparameter, ConfigSpace.hyperparameters.NumericalHyperparameter):
            result[:, index] = df[hyperparameter_name].to_numpy()
         elif isinstance(hyperparameter, ConfigSpace.hyperparameters.Constant):
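The three replacements above track ConfigSpace's move to a dict-like ConfigurationSpace. A minimal sketch of the mapping-style accessors (not part of the commit; assumes a recent ConfigSpace release with the mapping interface):

    from ConfigSpace import ConfigurationSpace

    # Toy space; the dict shorthand creates uniform hyperparameters.
    cs = ConfigurationSpace({"C": (0.01, 100.0), "tol": (1e-5, 1e-1)})

    print(list(cs.keys()))   # names, replaces get_hyperparameter_names()
    print(len(cs.values()))  # hyperparameter objects, replaces get_hyperparameters()
    print(cs.index_of["C"])  # column index, replaces get_idx_by_hyperparameter_name("C")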
@@ -43,92 +44,128 @@ def numeric_encode(df, config_space):
     return result
 
 
-def fanova_on_task(task_results, performance_column_name, curve_data_column, config_space, n_trees):
+def fanova_on_task(task_results, performance_column_name, current_anchor_value, config_space, n_trees):
     fanova_results = []
 
-    # query_confusion_matrix_values = lcdb.analysis.json.QueryMetricValuesFromAnchors("confusion_matrix", split_name="val")
-    # out = task_results[performance_column_name].apply(query_confusion_matrix_values)
-    # print(out)
-    # balanced_error_rate_values_for_config = np.array(
-    #     out.apply(lambda x: list(map(lambda x: 1 - lcdb.analysis.score.balanced_accuracy_from_confusion_matrix(x), x))).to_list())
-    # print(balanced_error_rate_values_for_config.mean(axis=0))
-    # print(out)
     evaluator = fanova.fanova.fANOVA(
         X=numeric_encode(task_results, config_space),
         Y=task_results[performance_column_name].to_numpy(),
         config_space=config_space,
         n_trees=n_trees,
     )
-    for idx, pname in enumerate(config_space.get_hyperparameter_names()):
+    for idx, pname in enumerate(config_space.keys()):
         logging.info('-- hyperparameter %d %s' % (idx, pname))
         unique_values = task_results[pname].unique()
         logging.info('-- UNIQUE VALUES: %d (%s)' % (len(unique_values), unique_values))
         importance = evaluator.quantify_importance([idx])
 
-        fanova_results.append(
-            {
-                "hyperparameter": pname,
-                "fanova": importance[(idx,)]["individual importance"],
-            }
-        )
-
+        fanova_results.append({
+            "hyperparameter": pname,
+            "anchor": current_anchor_value,
+            "variance_contribution": importance[(idx,)]["individual importance"],
+        })
     return fanova_results
 
 
 def run(args):
     fanova_all_results = []
-    performance_column = "objective"
-    curve_data_column = "m:json"
+    performance_column = "final_objective"  # make sure to give this a unique name (not same as the "objective" field)
+    anchor_size_column = "anchor_sizes"
+    learning_curve_column = "learning_curve_data"
 
     WorkflowClass = lcdb.builder.utils.import_attr_from_module(args.workflow_name)
     config_space = WorkflowClass.config_space()
-    workflow_hyperparameter_mapping = {"p:" + name: name for name in config_space.get_hyperparameter_names()}
+    workflow_hyperparameter_mapping = {"p:" + name: name for name in config_space.keys()}
     id_results = dict()
 
-    all_results_all_workflows = lcdb.db.LCDB().query(workflows=[args.workflow_name], openmlids=args.openml_ids)
+    all_results_all_workflows = lcdb.db.LCDB().query(
+        workflows=[args.workflow_name],
+        openmlids=args.openml_ids,
+        processors={
+            anchor_size_column: lcdb.analysis.json.QueryAnchorValues(),
+            learning_curve_column: lambda x: list(map(
+                lambda x: 1 - lcdb.analysis.score.balanced_accuracy_from_confusion_matrix(x),
+                lcdb.analysis.json.QueryMetricValuesFromAnchors("confusion_matrix", split_name="val")(x)
+            ))
+        }
+    )
     load_count = 0
     for frame_workflow_job_task in all_results_all_workflows:
         workflow_ids = frame_workflow_job_task['m:workflow'].unique()
         openml_task_ids = frame_workflow_job_task['m:openmlid'].unique()
         # job_ids = frame_workflow_job_task['job_id'].unique()
         if len(workflow_ids) > 1 or len(openml_task_ids) > 1:
             raise ValueError('Should not happen. %s %s' % (str(workflow_ids), str(openml_task_ids)))
-        if (workflow_ids[0], openml_task_ids[0]) not in id_results:
-            id_results[(workflow_ids[0], openml_task_ids[0])] = list()
-        id_results[(workflow_ids[0], openml_task_ids[0])].append(frame_workflow_job_task)
+
+        for current_anchor_value in args.anchor_values:
+            if (workflow_ids[0], openml_task_ids[0], current_anchor_value) not in id_results:
+                id_results[(workflow_ids[0], openml_task_ids[0], current_anchor_value)] = list()
+
+            performance_values_new = list()
+            for index, row in frame_workflow_job_task.iterrows():
+                anchor_sizes = row[anchor_size_column]
+                performance_value_at_anchor = np.nan
+                if current_anchor_value != -1:
+                    if current_anchor_value not in anchor_sizes:
+                        logging.warning('Anchor %d not available in task %d workflow %s'
+                                        % (current_anchor_value, openml_task_ids[0], workflow_ids[0])
+                        )
+                    else:
+                        anchor_index = anchor_sizes.index(current_anchor_value)
+                        performance_value_at_anchor = row[learning_curve_column][anchor_index]
+                else:
+                    performance_value_at_anchor = row[learning_curve_column][-1]
+                performance_values_new.append(performance_value_at_anchor)
+            performance_values_new = np.array(performance_values_new, dtype=float)
+
+            # make a copy
+            frame_copy = frame_workflow_job_task.copy(deep=True)
+            frame_copy[performance_column] = pd.Series(performance_values_new)
+            id_results[(workflow_ids[0], openml_task_ids[0], current_anchor_value)].append(frame_copy)
 
         load_count += 1
         if args.max_load and load_count >= args.max_load:
             break
 
     task_ids = set()
-    for idx, (workflow_name, task_id) in enumerate(id_results):
+    for idx, (workflow_name, task_id, current_anchor_value) in enumerate(id_results):
         task_ids.add(task_id)
-        task_results = pd.concat(id_results[(workflow_name, task_id)])
+        task_results = pd.concat(id_results[(workflow_name, task_id, current_anchor_value)])
         task_results = task_results.rename(workflow_hyperparameter_mapping, axis=1)
-        relevant_columns = list(workflow_hyperparameter_mapping.values()) + [performance_column, curve_data_column]
+        relevant_columns = list(workflow_hyperparameter_mapping.values()) + [performance_column]
         task_results = task_results[relevant_columns]
 
-        logging.info("Starting with task %d (%d/%d)" % (task_id, idx + 1, len(id_results)))
-        fanova_task_results = fanova_on_task(task_results, performance_column, curve_data_column, config_space, args.n_trees)
+        nan_count = task_results[performance_column].isna().sum()
+        logging.info("Starting with task %d anchor %d (%d/%d), shape %s %d nans" % (
+            task_id, current_anchor_value, idx + 1, len(id_results), task_results.shape, nan_count)
+        )
+
+        fanova_task_results = fanova_on_task(
+            task_results, performance_column, current_anchor_value, config_space, args.n_trees
+        )
         fanova_all_results.extend(fanova_task_results)
 
     fanova_all_results = pd.DataFrame(fanova_all_results)
 
     # generate plot
     fig, ax = plt.subplots(figsize=(16, 9))
-    sns.boxplot(x="hyperparameter", y="fanova", data=fanova_all_results, ax=ax)
+    sns.boxplot(x="hyperparameter", y="variance_contribution", hue="anchor", data=fanova_all_results, ax=ax)
     ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
     ax.set_ylabel("Variance Contribution")
     ax.set_xlabel(None)
     plt.title('hyperparameter importance %s on task ids %s' % (args.workflow_name, str(task_ids)))
     plt.tight_layout()
 
     # save plot to file
-    output_file = args.output_directory + '/fanova_%s.%s' % (args.workflow_name, args.output_filetype)
+    filename_suffix = ""
+    if args.anchor_values is not None:
+        filename_suffix = "_anchor_%s" % str(args.anchor_values)
+    output_file_base = args.output_directory + '/fanova_%s%s' % (args.workflow_name, filename_suffix)
     os.makedirs(args.output_directory, exist_ok=True)
-    plt.savefig(output_file)
-    logging.info('saved to %s' % output_file)
+    fanova_all_results.to_csv(output_file_base + '.csv')
+    plt.savefig(output_file_base + '.' + args.output_filetype)
+    logging.info('saved plot to %s.%s' % (output_file_base, args.output_filetype))
+    logging.info('saved csv to %s.csv' % output_file_base)
 
 
 if __name__ == '__main__':
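The new inner loop evaluates every configuration at each requested anchor by slicing its learning curve, with -1 meaning "use the largest anchor". A self-contained sketch of that lookup with toy numbers (illustrative values, not LCDB data):

    # Balanced error rate at three training-set anchors (illustrative values).
    anchor_sizes = [128, 512, 2048]
    learning_curve = [0.31, 0.24, 0.19]

    def value_at_anchor(anchor):
        if anchor == -1:
            return learning_curve[-1]  # last, i.e. largest, anchor
        if anchor not in anchor_sizes:
            return float("nan")        # the script logs a warning in this case
        return learning_curve[anchor_sizes.index(anchor)]

    print(value_at_anchor(512))  # 0.24
    print(value_at_anchor(-1))   # 0.19
    print(value_at_anchor(999))  # nan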
3 changes: 2 additions & 1 deletion publications/2023-neurips/lcdb/db/__init__.py
@@ -2,6 +2,7 @@
 
 from ._repository import Repository
 from ._local_repository import LocalRepository
+from ._pcloud_repository import PCloudRepository
 from ._database import LCDB
 
-__all__ = ["LCDB", "Repository", "LocalRepository"]
+__all__ = ["LCDB", "Repository", "LocalRepository", "PCloudRepository"]
108 changes: 102 additions & 6 deletions publications/2023-neurips/lcdb/db/_database.py
@@ -1,7 +1,7 @@
 import json
 import os
 import pathlib
-
+import re
 import pandas as pd
 from lcdb.db._repository import Repository
 from lcdb.db._util import get_path_to_lcdb, CountAwareGenerator
@@ -44,7 +44,10 @@ def create(self, config=None):
         self.path.mkdir(exist_ok=True, parents=True)
 
         # create default config file
-        default_config = {"repositories": {"local": ".lcdb/data"}}
+        default_config = {"repositories": {
+            "official": "pcloud://kZK9f70Zxwwjkt54zA8FY6kBUFB5PXoAYT9k",
+            "local": ".lcdb/data"}
+        }
         if config is not None:
             default_config.update(config)
         config = default_config
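Note that `default_config.update(config)` is a shallow merge: a user config that supplies its own "repositories" key replaces both of the new defaults rather than being merged with them. A plain-Python illustration (paths illustrative):

    default_config = {"repositories": {
        "official": "pcloud://kZK9f70Zxwwjkt54zA8FY6kBUFB5PXoAYT9k",
        "local": ".lcdb/data"}
    }
    default_config.update({"repositories": {"local": "/tmp/lcdb-data"}})
    print(default_config)  # {'repositories': {'local': '/tmp/lcdb-data'}}, "official" is gone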
@@ -73,9 +76,10 @@ def _load(self):
             cfg = json.load(f)
         repository_paths = {}
         for k, p in cfg["repositories"].items():
-            p = os.path.expanduser(p)
-            if p[:1] != "/":
-                p = f"{self.path.parent}/{p}"
+            if not p.startswith("pcloud://"):
+                p = os.path.expanduser(p)
+                if p[:1] != "/":
+                    p = f"{self.path.parent}/{p}"
             repository_paths[k] = p
 
         self._repositories = {}
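The added guard leaves pcloud:// URLs untouched, while plain paths are still user-expanded and, if relative, resolved against the config's parent directory. A standalone sketch of the rule (example paths; the config directory is an assumption):

    import os

    config_parent = os.path.expanduser("~/.lcdb")
    for p in ["pcloud://kZK9f70Zxwwjkt54zA8FY6kBUFB5PXoAYT9k", "~/data", ".lcdb/data"]:
        if not p.startswith("pcloud://"):
            p = os.path.expanduser(p)
            if p[:1] != "/":
                p = f"{config_parent}/{p}"
        print(p)  # URL kept verbatim; "~/data" expanded; ".lcdb/data" anchored to config_parent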
@@ -176,4 +180,96 @@ def generator():
         if workflows is not None and len(workflows) == 1:
             return dfs_per_workflow[workflows[0]] if workflows[0] in dfs_per_workflow else None
         else:
-            return dfs_per_workflow
+            return dfs_per_workflow
+
+
+    def debug(
+        self,
+        repositories=None,
+        campaigns=None,
+        workflows=None,
+        openmlids=None,
+        workflow_seeds=None,
+        test_seeds=None,
+        validation_seeds=None,
+        show_progress=False
+    ):
+        """
+        Retrieves only rows that contain a traceback and their associated configs.
+        """
+        if not self.loaded:
+            self._load()
+
+        if repositories is None:
+            repositories = list(self.repositories.values())
+        else:
+            requested_repository_names = set(repositories)
+            existing_repository_names = set(self.repositories.keys())
+            if (
+                len(requested_repository_names.difference(existing_repository_names))
+                > 0
+            ):
+                raise Exception(
+                    f"The following repositories were included in the query but do not exist in this LCDB_debug: "
+                    f"{requested_repository_names.difference(existing_repository_names)}"
+                )
+            repositories = [self.repositories[k] for k in requested_repository_names]
+
+        if workflows is not None and isinstance(workflows, str):
+            workflows = [workflows]
+
+        result_generators = []
+        for repository in repositories:
+            if repository.exists():
+                result_generators.append(
+                    repository.query_results_as_stream(
+                        campaigns=campaigns,
+                        workflows=workflows,
+                        openmlids=openmlids,
+                        workflow_seeds=workflow_seeds,
+                        test_seeds=test_seeds,
+                        validation_seeds=validation_seeds,
+                    )
+                )
+
+        def generator():
+            for gen in result_generators:
+                for res in gen:
+                    yield res
+
+        gen = CountAwareGenerator(sum([len(g) for g in result_generators]), generator())
+
+        tracebacks, configs, errors = [], [], []
+
+        for df in tqdm(gen, disable=not show_progress):
+            # check if "traceback" column exists
+            if "m:traceback" in df.columns:
+                traceback_rows = df[df["m:traceback"].notna()]
+
+                # extract corresponding configuration parameters
+                if not traceback_rows.empty:
+                    traceback_indices = traceback_rows.index.tolist()
+                    config_cols = [c for c in df.columns if c.startswith("p:")]
+                    # corresponding_configs = df.loc[traceback_rows.index]
+                    # configs.append(corresponding_configs)
+                    corresponding_configs_reset = df.loc[traceback_indices, config_cols].drop_duplicates().reset_index(drop=True)
+                    configs.append(corresponding_configs_reset)
+
+                    tracebacks.append(traceback_rows["m:traceback"])
+
+                    # extract the error type and message from the traceback string
+                    traceback_str = str(traceback_rows["m:traceback"].iloc[0])
+                    match = re.search(r'(\w+Error): (.*)', traceback_str)
+                    error_message = match.group(0) if match is not None else traceback_str
+                    errors.append(error_message)
+
+            else:
+                print("Error: no traceback column in dataframe")
+
+        return {
+            "configs": pd.concat(configs, ignore_index=True) if configs else None,
+            "tracebacks": pd.concat(tracebacks, ignore_index=True) if tracebacks else None,
+            "errors": pd.Series(errors) if errors else None
+        }
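A hedged usage sketch for the new debug() helper (workflow and task ids are illustrative; the return value is a dict with "configs", "tracebacks", and "errors"). Note that the method calls tqdm, whose import is not visible in the hunks shown here:

    from lcdb.db import LCDB

    report = LCDB().debug(
        workflows="lcdb.workflow.sklearn.LibLinearWorkflow",
        openmlids=[3, 6],
        show_progress=True,
    )
    if report["errors"] is not None:
        print(report["errors"].value_counts())  # most common error types first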
3 changes: 0 additions & 3 deletions publications/2023-neurips/lcdb/db/_local_repository.py
@@ -5,14 +5,11 @@
 import time
 
 import pandas as pd
-from tqdm import tqdm
 
 from lcdb.db._dataframe import deserialize_dataframe
 from lcdb.db._repository import Repository
 from lcdb.analysis.json import JsonQuery
 from ._util import CountAwareGenerator
-
-from tqdm import tqdm
 
 class LocalRepository(Repository):
 
(The remaining changed files in this commit were not loaded on this page.)