Commit 8bde36b
Merge branch 'deephyper' of https://github.com/fmohr/lcdb.git into deephyper
2 parents bd2d2f6 + 8c7a7ae
felix committed Oct 22, 2024
Showing 8 changed files with 634 additions and 2,335 deletions.
37 changes: 37 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,37 @@
name: Continuous integration

on:
- pull_request
- push


jobs:

test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version:
- "3.12"
defaults:
run:
working-directory: publications/2023-neurips/
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
pip install --upgrade pip
pip install tox pylint black
# - name: Run Formatter
# run: black --diff --check $(git ls-files '*.py')
- name: Run Linter
run: pylint --exit-zero $(git ls-files '*.py')
- name: Run tests with tox
run: tox -e py3
- name: Upload coverage report
if: ${{ matrix.python-version == '3.12' }} # Only upload coverage once
uses: codecov/codecov-action@v1
@@ -16,21 +16,22 @@
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--n_trees', type=int, default=16)
parser.add_argument('--openml_ids', type=int, nargs='+', default=None)
parser.add_argument('--openml_ids', type=int, nargs='+', default=[3, 6])
parser.add_argument('--workflow_name', type=str, default="lcdb.workflow.sklearn.LibLinearWorkflow")
parser.add_argument('--openml_taskid_name', type=str, default="m:openmlid")
parser.add_argument('--output_directory', type=str, default=os.path.expanduser('~/experiments/lcdb'))
parser.add_argument('--output_filetype', type=str, choices=['pdf', 'png'], default='png')
parser.add_argument('--max_load', type=int, default=None)
parser.add_argument('--anchor_values', type=int, nargs='+', default=[128, 512, 2048, -1])
return parser.parse_args()


def numeric_encode(df, config_space):
# https://automl.github.io/ConfigSpace/latest/api/ConfigSpace/configuration_space/
result = np.zeros((len(df), len(config_space.get_hyperparameters())), dtype=float)
result = np.zeros((len(df), len(config_space.values())), dtype=float)

for hyperparameter_name, hyperparameter in config_space.items():
index = config_space.get_idx_by_hyperparameter_name(hyperparameter_name)
index = config_space.index_of[hyperparameter_name]
if isinstance(hyperparameter, ConfigSpace.hyperparameters.NumericalHyperparameter):
result[:, index] = df[hyperparameter_name].to_numpy()
elif isinstance(hyperparameter, ConfigSpace.hyperparameters.Constant):
@@ -43,92 +44,128 @@ def numeric_encode(df, config_space):
return result
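
The rewritten numeric_encode targets the dict-like ConfigSpace API (keys()/values()/items() and the index_of mapping) instead of the deprecated get_hyperparameters()/get_idx_by_hyperparameter_name() accessors. A minimal sketch of that API on a toy space, assuming a recent ConfigSpace release; the hyperparameter names and ranges are invented for illustration:

from ConfigSpace import ConfigurationSpace

# Hypothetical two-hyperparameter space (not the LibLinear space from the repo).
cs = ConfigurationSpace({"C": (0.01, 100.0), "dual": [True, False]})

print(list(cs.keys()))   # hyperparameter names
print(len(cs.values()))  # number of hyperparameters = number of encoded columns
print(cs.index_of["C"])  # column index used when filling the encoded matrix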


def fanova_on_task(task_results, performance_column_name, curve_data_column, config_space, n_trees):
def fanova_on_task(task_results, performance_column_name, current_anchor_value, config_space, n_trees):
fanova_results = []

# query_confusion_matrix_values = lcdb.analysis.json.QueryMetricValuesFromAnchors("confusion_matrix", split_name="val")
# out = task_results[performance_column_name].apply(query_confusion_matrix_values)
# print(out)
# balanced_error_rate_values_for_config = np.array(
# out.apply(lambda x: list(map(lambda x: 1 - lcdb.analysis.score.balanced_accuracy_from_confusion_matrix(x), x))).to_list())
# print(balanced_error_rate_values_for_config.mean(axis=0))
# print(out)
evaluator = fanova.fanova.fANOVA(
X=numeric_encode(task_results, config_space),
Y=task_results[performance_column_name].to_numpy(),
config_space=config_space,
n_trees=n_trees,
)
for idx, pname in enumerate(config_space.get_hyperparameter_names()):
for idx, pname in enumerate(config_space.keys()):
logging.info('-- hyperparameter %d %s' % (idx, pname))
unique_values = task_results[pname].unique()
logging.info('-- UNIQUE VALUES: %d (%s)' % (len(unique_values), unique_values))
importance = evaluator.quantify_importance([idx])

fanova_results.append(
{
"hyperparameter": pname,
"fanova": importance[(idx,)]["individual importance"],
}
)

fanova_results.append({
"hyperparameter": pname,
"anchor": current_anchor_value,
"variance_contribution": importance[(idx,)]["individual importance"],
})
return fanova_results
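
For reference, fANOVA's quantify_importance takes a list of hyperparameter indices and returns a dict keyed by index tuples; the single-dimension entry holds the marginal variance contribution read out above. A hedged, self-contained sketch on synthetic data (the real call site passes the encoded LCDB results and the workflow config space):

import numpy as np
import fanova.fanova

rng = np.random.default_rng(0)
X = rng.random((50, 2))  # synthetic encoded configurations
Y = rng.random(50)       # synthetic objective values
evaluator = fanova.fanova.fANOVA(X=X, Y=Y, n_trees=16)
importance = evaluator.quantify_importance([0])
print(importance[(0,)]["individual importance"])  # fraction of variance explained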


def run(args):
fanova_all_results = []
performance_column = "objective"
curve_data_column = "m:json"
performance_column = "final_objective" # make sure to give this a unique name (not same as the "objective" field)
anchor_size_column = "anchor_sizes"
learning_curve_column = "learning_curve_data"

WorkflowClass = lcdb.builder.utils.import_attr_from_module(args.workflow_name)
config_space = WorkflowClass.config_space()
workflow_hyperparameter_mapping = {"p:" + name: name for name in config_space.get_hyperparameter_names()}
workflow_hyperparameter_mapping = {"p:" + name: name for name in config_space.keys()}
id_results = dict()

all_results_all_workflows = lcdb.db.LCDB().query(workflows=[args.workflow_name], openmlids=args.openml_ids)
all_results_all_workflows = lcdb.db.LCDB().query(
workflows=[args.workflow_name],
openmlids=args.openml_ids,
processors={
anchor_size_column: lcdb.analysis.json.QueryAnchorValues(),
learning_curve_column: lambda x: list(map(
lambda cm: 1 - lcdb.analysis.score.balanced_accuracy_from_confusion_matrix(cm),
lcdb.analysis.json.QueryMetricValuesFromAnchors("confusion_matrix", split_name="val")(x)
))
}
)
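
The learning_curve_column processor above converts each validation confusion matrix on the curve into a balanced error rate. Balanced accuracy is the mean of per-class recalls; a minimal NumPy sketch of the quantity being computed, independent of the lcdb helper (which may handle edge cases differently):

import numpy as np

def balanced_accuracy(cm):
    # mean per-class recall; rows of cm are true classes, columns predictions
    cm = np.asarray(cm, dtype=float)
    return float(np.mean(np.diag(cm) / cm.sum(axis=1)))

cm = [[40, 10], [5, 45]]          # toy 2-class confusion matrix
print(1 - balanced_accuracy(cm))  # balanced error rate, as stored in the curve
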
load_count = 0
for frame_workflow_job_task in all_results_all_workflows:
workflow_ids = frame_workflow_job_task['m:workflow'].unique()
openml_task_ids = frame_workflow_job_task['m:openmlid'].unique()
# job_ids = frame_workflow_job_task['job_id'].unique()
if len(workflow_ids) > 1 or len(openml_task_ids) > 1:
raise ValueError('Should not happen. %s %s' % (str(workflow_ids), str(openml_task_ids)))
if (workflow_ids[0], openml_task_ids[0]) not in id_results:
id_results[(workflow_ids[0], openml_task_ids[0])] = list()
id_results[(workflow_ids[0], openml_task_ids[0])].append(frame_workflow_job_task)

for current_anchor_value in args.anchor_values:
if (workflow_ids[0], openml_task_ids[0], current_anchor_value) not in id_results:
id_results[(workflow_ids[0], openml_task_ids[0], current_anchor_value)] = list()

performance_values_new = list()
for index, row in frame_workflow_job_task.iterrows():
anchor_sizes = row[anchor_size_column]
performance_value_at_anchor = np.nan
if current_anchor_value != -1:
if current_anchor_value not in anchor_sizes:
logging.warning('Anchor %d not available in task %d workflow %s'
% (current_anchor_value, openml_task_ids[0], workflow_ids[0])
)
else:
anchor_index = anchor_sizes.index(current_anchor_value)
performance_value_at_anchor = row[learning_curve_column][anchor_index]
else:
performance_value_at_anchor = row[learning_curve_column][-1]
performance_values_new.append(performance_value_at_anchor)
performance_values_new = np.array(performance_values_new, dtype=float)

# make a copy
frame_copy = frame_workflow_job_task.copy(deep=True)
frame_copy[performance_column] = performance_values_new  # assign positionally; a fresh pd.Series would realign on the index and can introduce NaNs
id_results[(workflow_ids[0], openml_task_ids[0], current_anchor_value)].append(frame_copy)

load_count += 1
if args.max_load and load_count >= args.max_load:
break

task_ids = set()
for idx, (workflow_name, task_id) in enumerate(id_results):
for idx, (workflow_name, task_id, current_anchor_value) in enumerate(id_results):
task_ids.add(task_id)
task_results = pd.concat(id_results[(workflow_name, task_id)])
task_results = pd.concat(id_results[(workflow_name, task_id, current_anchor_value)])
task_results = task_results.rename(workflow_hyperparameter_mapping, axis=1)
relevant_columns = list(workflow_hyperparameter_mapping.values()) + [performance_column, curve_data_column]
relevant_columns = list(workflow_hyperparameter_mapping.values()) + [performance_column]
task_results = task_results[relevant_columns]

logging.info("Starting with task %d (%d/%d)" % (task_id, idx + 1, len(id_results)))
fanova_task_results = fanova_on_task(task_results, performance_column, curve_data_column, config_space, args.n_trees)
nan_count = task_results[performance_column].isna().sum()
logging.info("Starting with task %d anchor %d (%d/%d), shape %s %d nans" % (
task_id, current_anchor_value, idx + 1, len(id_results), task_results.shape, nan_count)
)

fanova_task_results = fanova_on_task(
task_results, performance_column, current_anchor_value, config_space, args.n_trees
)
fanova_all_results.extend(fanova_task_results)

fanova_all_results = pd.DataFrame(fanova_all_results)

# generate plot
fig, ax = plt.subplots(figsize=(16, 9))
sns.boxplot(x="hyperparameter", y="fanova", data=fanova_all_results, ax=ax)
sns.boxplot(x="hyperparameter", y="variance_contribution", hue="anchor", data=fanova_all_results, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_ylabel("Variance Contribution")
ax.set_xlabel(None)
plt.title('hyperparameter importance %s on task ids %s' % (args.workflow_name, str(task_ids)))
plt.tight_layout()

# save plot to file
output_file = args.output_directory + '/fanova_%s.%s' % (args.workflow_name, args.output_filetype)
filename_suffix = ""
if args.anchor_values is not None:
filename_suffix = "_anchor_%s" % str(args.anchor_values)
output_file_base = args.output_directory + '/fanova_%s%s' % (args.workflow_name, filename_suffix)
os.makedirs(args.output_directory, exist_ok=True)
plt.savefig(output_file)
logging.info('saved to %s' % output_file)
fanova_all_results.to_csv(output_file_base + '.csv')
plt.savefig(output_file_base + '.' + args.output_filetype)
logging.info('saved plot to %s.%s' % (output_file_base, args.output_filetype))
logging.info('saved csv to %s.csv' % output_file_base)


if __name__ == '__main__':
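
The -1 sentinel in --anchor_values selects the last point of each learning curve, while positive anchors must match an entry in the row's anchor sizes. A toy illustration of the lookup performed in the run() loop (values invented):

anchor_sizes = [16, 32, 64, 128, 256]
curve = [0.45, 0.38, 0.30, 0.24, 0.21]  # balanced error rate per anchor

def value_at(anchor):
    # -1 means "largest available anchor", mirroring run()
    return curve[-1] if anchor == -1 else curve[anchor_sizes.index(anchor)]

print(value_at(128))  # 0.24
print(value_at(-1))   # 0.21
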
96 changes: 94 additions & 2 deletions publications/2023-neurips/lcdb/db/_database.py
@@ -1,7 +1,7 @@
import json
import os
import pathlib

import re
import pandas as pd
from lcdb.db._repository import Repository
from lcdb.db._util import get_path_to_lcdb, CountAwareGenerator
@@ -177,4 +177,96 @@ def generator():
if workflows is not None and len(workflows) == 1:
return dfs_per_workflow[workflows[0]] if workflows[0] in dfs_per_workflow else None
else:
return dfs_per_workflow
return dfs_per_workflow


def debug(
self,
repositories=None,
campaigns=None,
workflows=None,
openmlids=None,
workflow_seeds=None,
test_seeds=None,
validation_seeds=None,
show_progress=False
):
"""
Retrieves only rows that contain a traceback and their associated configs.
"""
if not self.loaded:
self._load()

if repositories is None:
repositories = list(self.repositories.values())
else:
requested_repository_names = set(repositories)
existing_repository_names = set(self.repositories.keys())
if (
len(requested_repository_names.difference(existing_repository_names))
> 0
):
raise Exception(
f"The following repositories were included in the query but do not exist in this LCDB_debug: "
f"{requested_repository_names.difference(existing_repository_names)}"
)
repositories = [self.repositories[k] for k in requested_repository_names]

if workflows is not None and isinstance(workflows, str):
workflows = [workflows]

result_generators = []
for repository in repositories:
if repository.exists():
result_generators.append(
repository.query_results_as_stream(
campaigns=campaigns,
workflows=workflows,
openmlids=openmlids,
workflow_seeds=workflow_seeds,
test_seeds=test_seeds,
validation_seeds=validation_seeds,
)
)

def generator():
for gen in result_generators:
for res in gen:
yield res

gen = CountAwareGenerator(sum([len(g) for g in result_generators]), generator())

tracebacks, configs, errors = [], [], []

for df in tqdm(gen, disable=not show_progress):
# check if "traceback" column exists
if "m:traceback" in df.columns:
traceback_rows = df[df["m:traceback"].notna()]

# extract corresponding configuration parameters
if not traceback_rows.empty:
traceback_indices = traceback_rows.index.tolist()
config_cols = [c for c in df.columns if c.startswith("p:")]
# corresponding_configs = df.loc[traceback_rows.index]
# configs.append(corresponding_configs)
corresponding_configs_reset = df.loc[traceback_indices, config_cols].drop_duplicates().reset_index(drop=True)
configs.append(corresponding_configs_reset)

tracebacks.append(traceback_rows["m:traceback"])

# extract the error type and message from the first traceback string in this frame
traceback_str = str(traceback_rows["m:traceback"].iloc[0])
try:
error_message = re.search(r'(\w+Error): (.*)', traceback_str).group(0)
except AttributeError:  # re.search returned None: no "<...>Error: <message>" pattern found
error_message = traceback_str
errors.append(error_message)

else:
print("Error: no traceback column in dataframe")

return {
"configs": pd.concat(configs, ignore_index=True) if configs else None,
"tracebacks": pd.concat(tracebacks, ignore_index=True) if tracebacks else None,
"errors": pd.Series(errors) if errors else None
}
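
A hedged usage sketch of the new debug helper, assuming it is exposed on lcdb.db.LCDB alongside query; the workflow name and OpenML ids are illustrative:

import lcdb.db

report = lcdb.db.LCDB().debug(
    workflows=["lcdb.workflow.sklearn.LibLinearWorkflow"],
    openmlids=[3, 6],
    show_progress=True,
)
if report["errors"] is not None:
    print(report["errors"].value_counts())  # most frequent error types
if report["configs"] is not None:
    print(report["configs"].head())         # hyperparameter configs of failed runs
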
6 changes: 6 additions & 0 deletions publications/2023-neurips/pytest.ini
@@ -0,0 +1,6 @@
[pytest]
norecursedirs = .git
markers =
db: marks to define a test that requires a local database from LCDB data.
filterwarnings =
ignore:The objective has been evaluated at this point before.:UserWarning
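
The db marker tags tests that need local LCDB data, so runs without a database can deselect them via pytest -m "not db". A minimal sketch of a test using it (the test body is hypothetical):

import pytest

@pytest.mark.db
def test_local_database_loads():
    # runs only when local LCDB data is available
    import lcdb.db
    assert lcdb.db.LCDB() is not None
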
4 changes: 2 additions & 2 deletions publications/2023-neurips/setup.cfg
@@ -20,10 +20,10 @@ classifiers =
Development Status :: 3 - Alpha
Programming Language :: Python
Programming Language :: Python :: 3
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Programming Language :: Python :: 3.12

[options]
packages = find: