Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improving the performance of check_datasets_active, modifying unit test #980

Merged
merged 4 commits into from
Oct 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/progress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Changelog

0.11.1
~~~~~~
* MAINT #671: Improved the performance of ``check_datasets_active`` by querying only the given list of datasets instead of querying all datasets. Modified the corresponding unit test.

0.11.0
~~~~~~
Expand Down
14 changes: 12 additions & 2 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,14 +333,23 @@ def _load_features_from_file(features_file: str) -> Dict:
return xml_dict["oml:data_features"]


def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]:
def check_datasets_active(
dataset_ids: List[int],
raise_error_if_not_exist: bool = True,
) -> Dict[int, bool]:
"""
Check if the dataset ids provided are active.

Raises an error if a dataset_id in the given list
of dataset_ids does not exist on the server and
``raise_error_if_not_exist`` is True.

Parameters
----------
dataset_ids : List[int]
A list of integers representing dataset ids.
raise_error_if_not_exist : bool (default=True)
If True, raise a ValueError when one or more of the given dataset
ids do not exist on the server. If False, such dataset ids are
left out of the returned dictionary instead.

Returns
-------
Expand All @@ -353,7 +362,8 @@ def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]:
for did in dataset_ids:
dataset = dataset_list.get(did, None)
if dataset is None:
raise ValueError("Could not find dataset {} in OpenML dataset list.".format(did))
if raise_error_if_not_exist:
raise ValueError(f'Could not find dataset {did} in OpenML dataset list.')
else:
active[did] = dataset["status"] == "active"

Expand Down
6 changes: 5 additions & 1 deletion tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,13 @@ def test_list_datasets_empty(self):
def test_check_datasets_active(self):
# Have to test on live because there is no deactivated dataset on the test server.
openml.config.server = self.production_server
active = openml.datasets.check_datasets_active([2, 17])
active = openml.datasets.check_datasets_active(
[2, 17, 79],
raise_error_if_not_exist=False,
)
self.assertTrue(active[2])
self.assertFalse(active[17])
self.assertIsNone(active.get(79))
self.assertRaisesRegex(
ValueError,
"Could not find dataset 79 in OpenML dataset list.",
Expand Down