diff --git a/.travis.yml b/.travis.yml index 675186469..beaa3b53e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,10 +15,11 @@ env: - TEST_DIR=/tmp/test_dir/ - MODULE=openml matrix: - - DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.20.0" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.0" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" RUN_FLAKE8="true" SKIP_TESTS="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" COVERAGE="true" DOCPUSH="true" + - DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.21.2" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2" # Checks for older scikit-learn versions (which also don't nicely work with # Python3.7) - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b13051d67..5a77dfd58 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -81,6 +81,10 @@ following rules before you submit a pull request: Drafts often benefit from the inclusion of a [task list](https://github.com/blog/1375-task-lists-in-gfm-issues-pulls-comments) in the PR description. + +- Add [unit tests](https://github.com/openml/openml-python/tree/develop/tests) and [examples](https://github.com/openml/openml-python/tree/develop/examples) for any new functionality being introduced. + - If a unit test uploads anything to the test server, please ensure that the uploaded entities are marked for deletion afterwards, so that the test server does not fill up with test artifacts. For example, `TestBase._mark_entity_for_removal('data', dataset.dataset_id)`, `TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))`. + - Please ensure that each example runs against the test server by starting with a call to `openml.config.start_using_configuration_for_example()`. - All tests pass when running `pytest`. On Unix-like systems, check with (from the toplevel source folder): diff --git a/LICENSE b/LICENSE index 146b8cc36..e08aa862b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2014-2018, Matthias Feurer, Jan van Rijn, Andreas Müller, +Copyright (c) 2014-2019, Matthias Feurer, Jan van Rijn, Andreas Müller, Joaquin Vanschoren and others. All rights reserved.
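To make the CONTRIBUTING.md guidance above concrete, here is a minimal sketch of what a compliant unit test might look like. It assumes `openml.testing.TestBase` is the package's test base class and that `_mark_entity_for_removal` behaves as described in the guideline; the model, import path, and test name are purely illustrative.

```python
from sklearn.tree import DecisionTreeClassifier

from openml.extensions.sklearn import SklearnExtension  # assumed import path
from openml.testing import TestBase  # assumed location of the project's test base class


class TestFlowUpload(TestBase):
    """Hypothetical test demonstrating the clean-up convention described above."""

    def test_publish_flow_marks_entity_for_removal(self):
        # Serialize a simple scikit-learn model into an OpenML flow and publish it.
        # TestBase is assumed to point the client at the test server.
        flow = SklearnExtension().model_to_flow(DecisionTreeClassifier())
        flow.publish()
        # Record the uploaded flow so the CI clean-up step can delete it later,
        # keeping the test server from filling up with test artifacts.
        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
        self.assertIsNotNone(flow.flow_id)
```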
diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md index 4cedd1478..571ae0d1c 100644 --- a/PULL_REQUEST_TEMPLATE.md +++ b/PULL_REQUEST_TEMPLATE.md @@ -9,6 +9,8 @@ Please make sure that: * for any new function or class added, please add it to doc/api.rst * the list of classes and functions should be alphabetical * for any new functionality, consider adding a relevant example +* add unit tests for new functionalities + * collect files uploaded to test server using _mark_entity_for_removal() --> #### Reference Issue diff --git a/ci_scripts/test.sh b/ci_scripts/test.sh index 80b35f04f..1c82591e0 100644 --- a/ci_scripts/test.sh +++ b/ci_scripts/test.sh @@ -1,5 +1,11 @@ set -e +# check status and branch before running the unit tests +before="`git status --porcelain -b`" +before="$before" +# storing current working directory +curr_dir=`pwd` + run_tests() { # Get into a temp directory to run test from the installed scikit learn and # check if we do not leave artifacts @@ -22,7 +28,7 @@ run_tests() { PYTEST_ARGS='' fi - pytest -n 4 --duration=20 --timeout=600 --timeout-method=thread -sv --ignore='test_OpenMLDemo.py' $PYTEST_ARGS $test_dir + pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread -sv --ignore='test_OpenMLDemo.py' $PYTEST_ARGS $test_dir } if [[ "$RUN_FLAKE8" == "true" ]]; then @@ -32,3 +38,15 @@ fi if [[ "$SKIP_TESTS" != "true" ]]; then run_tests fi + +# changing directory to stored working directory +cd $curr_dir +# check status and branch after running the unit tests +# compares with $before to check for remaining files +after="`git status --porcelain -b`" +if [[ "$before" != "$after" ]]; then + echo 'git status from before: '$before + echo 'git status from after: '$after + echo "All generated files have not been deleted!" + exit 1 +fi diff --git a/doc/api.rst b/doc/api.rst index 93a6d18b6..7979c7bfc 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -72,6 +72,7 @@ Modules get_dataset get_datasets list_datasets + list_qualities status_update :mod:`openml.evaluations`: Evaluation Functions @@ -83,6 +84,7 @@ Modules :template: function.rst list_evaluations + list_evaluation_measures :mod:`openml.flows`: Flow Functions ----------------------------------- diff --git a/doc/conf.py b/doc/conf.py index 9b49078fb..03a2ec0db 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -15,6 +15,7 @@ import os import sys import sphinx_bootstrap_theme +import time import openml # If extensions (or modules to document with autodoc) are in another directory, @@ -65,7 +66,7 @@ # General information about the project. project = u'OpenML' copyright = ( - u'2014-2019, the OpenML-Python team.' + u'2014-{}, the OpenML-Python team.'.format(time.strftime("%Y,%m,%d,%H,%M,%S").split(',')[0]) ) # The version info for the project you're documenting, acts as replacement for diff --git a/doc/index.rst b/doc/index.rst index 8752dbe9b..96e534705 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -21,16 +21,12 @@ Example .. code:: python import openml - from sklearn import preprocessing, tree, pipeline - - # Set the OpenML API Key which is required to upload your runs. - # You can get your own API by signing up to OpenML.org. - openml.config.apikey = 'ABC' + from sklearn import impute, tree, pipeline # Define a scikit-learn classifier or pipeline clf = pipeline.Pipeline( steps=[ - ('imputer', preprocessing.Imputer()), + ('imputer', impute.SimpleImputer()), ('estimator', tree.DecisionTreeClassifier()) ] ) @@ -39,10 +35,13 @@ Example task = openml.tasks.get_task(31) # Run the scikit-learn model on the task. 
run = openml.runs.run_model_on_task(clf, task) - # Publish the experiment on OpenML (optional, requires an API key). + # Publish the experiment on OpenML (optional, requires an API key. + # You can get your own API key by signing up to OpenML.org) run.publish() print('View the run online: %s/run/%d' % (openml.config.server, run.run_id)) +You can find more examples in our `examples gallery `_. + ---------------------------- How to get OpenML for python ---------------------------- diff --git a/doc/progress.rst b/doc/progress.rst index 5629eb0cb..33db154ef 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -6,6 +6,25 @@ Changelog ========= +0.10.0 +~~~~~~ +* ADD #737: Add list_evaluations_setups to return hyperparameters along with the list of evaluations. +* FIX #261: Test server is cleared of all files uploaded during unit testing. +* FIX #447: All files created by unit tests are deleted after the completion of all unit tests. +* FIX #608: Fixing dataset_id referenced before assignment error in get_run function. +* FIX #589: Fixing a bug that did not successfully upload the columns to ignore when creating and publishing a dataset. +* DOC #639: More descriptive documentation for the function to convert array format. +* DOC #719: Add documentation on uploading tasks. +* ADD #687: Adds a function to retrieve the list of evaluation measures available. +* ADD #695: A function to retrieve all the data quality measures available. +* ADD #412: Add a function to trim flow names for scikit-learn flows. +* ADD #715: `list_evaluations` now has an option to sort evaluations by score (value). +* ADD #722: Automatic reinstantiation of flow in `run_model_on_task`. Clearer errors if that's not possible. +* ADD #412: The scikit-learn extension populates the short name field for flows. +* MAINT #726: Update examples to remove deprecation warnings from scikit-learn. +* MAINT #752: Update OpenML-Python to be compatible with sklearn 0.21. + + 0.9.0 ~~~~~ * ADD #560: OpenML-Python can now handle regression tasks as well. @@ -21,6 +40,7 @@ Changelog * ADD #659: Lazy loading of task splits. * ADD #516: `run_flow_on_task` flow uploading is now optional. * ADD #680: Adds `openml.config.start_using_configuration_for_example` (and resp. stop) to easily connect to the test server. +* ADD #75, #653: Adds a pretty print for objects of the top-level classes. * FIX #642: `check_datasets_active` now correctly also returns active status of deactivated datasets. * FIX #304, #636: Allow serialization of numpy datatypes and list of lists of more types (e.g. bools, ints) for flows. * FIX #651: Fixed a bug that would prevent openml-python from finding the user's config file.
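The changelog above introduces several new entry points (`list_qualities`, `list_evaluation_measures`, `list_evaluations_setups`, and the `sort_order` option of `list_evaluations`). The snippet below is a rough sketch of how they fit together; the task id and result sizes are arbitrary, and the calls mirror the signatures added elsewhere in this diff rather than a documented, stable API.

```python
import openml

# Connect to the test server so the sketch does not touch production data.
openml.config.start_using_configuration_for_example()

# Newly exposed listings of server-side metadata.
print(openml.datasets.list_qualities()[:5])
print(openml.evaluations.list_evaluation_measures()[:5])

# Evaluations can now be sorted server-side via `sort_order`.
evals = openml.evaluations.list_evaluations(
    function='predictive_accuracy',
    task=[31],                    # arbitrary task id for illustration
    size=10,
    sort_order='desc',
    output_format='dataframe',
)
print(evals.head())

# Evaluations joined with the hyperparameter settings of their setups.
evals_setups = openml.evaluations.list_evaluations_setups(
    function='predictive_accuracy',
    task=[31],
    size=10,
)
print(evals_setups.head())

openml.config.stop_using_configuration_for_example()
```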
diff --git a/examples/fetch_evaluations_tutorial.py b/examples/fetch_evaluations_tutorial.py index 97872e9f7..10511c540 100644 --- a/examples/fetch_evaluations_tutorial.py +++ b/examples/fetch_evaluations_tutorial.py @@ -20,7 +20,6 @@ ############################################################################ import openml -from pprint import pprint ############################################################################ # Listing evaluations @@ -37,7 +36,7 @@ output_format='dataframe') # Querying the returned results for precision above 0.98 -pprint(evals[evals.value > 0.98]) +print(evals[evals.value > 0.98]) ############################################################################# # Viewing a sample task @@ -47,7 +46,7 @@ # We will start by displaying a simple *supervised classification* task: task_id = 167140 # https://www.openml.org/t/167140 task = openml.tasks.get_task(task_id) -pprint(vars(task)) +print(task) ############################################################################# # Obtaining all the evaluations for the task @@ -60,11 +59,11 @@ evals = openml.evaluations.list_evaluations(function=metric, task=[task_id], output_format='dataframe') # Displaying the first 10 rows -pprint(evals.head(n=10)) +print(evals.head(n=10)) # Sorting the evaluations in decreasing order of the metric chosen evals = evals.sort_values(by='value', ascending=False) print("\nDisplaying head of sorted dataframe: ") -pprint(evals.head()) +print(evals.head()) ############################################################################# # Obtaining CDF of metric for chosen task @@ -147,4 +146,4 @@ def plot_flow_compare(evaluations, top_n=10, metric='predictive_accuracy'): flow_ids = evals.flow_id.unique()[:top_n] flow_names = evals.flow_name.unique()[:top_n] for i in range(top_n): - pprint((flow_ids[i], flow_names[i])) + print((flow_ids[i], flow_names[i])) diff --git a/examples/flows_and_runs_tutorial.py b/examples/flows_and_runs_tutorial.py index d196c30ee..d65abdf28 100644 --- a/examples/flows_and_runs_tutorial.py +++ b/examples/flows_and_runs_tutorial.py @@ -6,8 +6,7 @@ """ import openml -from pprint import pprint -from sklearn import ensemble, neighbors, preprocessing, pipeline, tree +from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree ############################################################################ # Train machine learning models @@ -39,8 +38,9 @@ target=dataset.default_target_attribute ) print("Categorical features: {}".format(categorical_indicator)) -enc = preprocessing.OneHotEncoder(categorical_features=categorical_indicator) -X = enc.fit_transform(X) +transformer = compose.ColumnTransformer( + [('one_hot_encoder', preprocessing.OneHotEncoder(categories='auto'), categorical_indicator)]) +X = transformer.fit_transform(X) clf.fit(X, y) ############################################################################ @@ -57,7 +57,7 @@ # Run the flow run = openml.runs.run_model_on_task(clf, task) -# pprint(vars(run), depth=2) +print(run) ############################################################################ # Share the run on the OpenML server @@ -74,18 +74,38 @@ # We can now also inspect the flow object which was automatically created: flow = openml.flows.get_flow(run.flow_id) -pprint(vars(flow), depth=1) +print(flow) ############################################################################ # It also works with pipelines # ############################ # # When you need to handle 'dirty' data, build pipelines to model then automatically. 
-task = openml.tasks.get_task(115) +task = openml.tasks.get_task(1) +features = task.get_dataset().features +nominal_feature_indices = [ + i for i in range(len(features)) + if features[i].name != task.target_name and features[i].data_type == 'nominal' +] pipe = pipeline.Pipeline(steps=[ - ('Imputer', preprocessing.Imputer(strategy='median')), - ('OneHotEncoder', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')), - ('Classifier', ensemble.RandomForestClassifier()) + ( + 'Preprocessing', + compose.ColumnTransformer([ + ('Nominal', pipeline.Pipeline( + [ + ('Imputer', impute.SimpleImputer(strategy='most_frequent')), + ( + 'Encoder', + preprocessing.OneHotEncoder( + sparse=False, handle_unknown='ignore', + ) + ), + ]), + nominal_feature_indices, + ), + ]), + ), + ('Classifier', ensemble.RandomForestClassifier(n_estimators=10)) ]) run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False) diff --git a/examples/introduction_tutorial.py b/examples/introduction_tutorial.py index 7dc3a8324..9cd88ceba 100644 --- a/examples/introduction_tutorial.py +++ b/examples/introduction_tutorial.py @@ -1,6 +1,6 @@ """ Introduction -=================== +============ An introduction to OpenML, followed up by a simple example. """ @@ -15,6 +15,8 @@ # * Works seamlessly with scikit-learn and other libraries # * Large scale benchmarking, compare to state of the art # + +############################################################################ # Installation # ^^^^^^^^^^^^ # Installation is done via ``pip``: @@ -26,6 +28,8 @@ # For further information, please check out the installation guide at # https://openml.github.io/openml-python/master/contributing.html#installation # + +############################################################################ # Authentication # ^^^^^^^^^^^^^^ # @@ -49,6 +53,7 @@ # .. warning:: This example uploads data. For that reason, this example # connects to the test server instead. This prevents the live server from # crowding with example datasets, tasks, studies, and so on. + ############################################################################ import openml from sklearn import neighbors diff --git a/examples/sklearn/openml_run_example.py b/examples/sklearn/openml_run_example.py index 84e11bd54..195a0aa77 100644 --- a/examples/sklearn/openml_run_example.py +++ b/examples/sklearn/openml_run_example.py @@ -5,7 +5,7 @@ An example of an automated machine learning experiment. """ import openml -from sklearn import tree, preprocessing, pipeline +from sklearn import impute, tree, pipeline ############################################################################ # .. warning:: This example uploads data. 
For that reason, this example @@ -21,7 +21,7 @@ # Define a scikit-learn pipeline clf = pipeline.Pipeline( steps=[ - ('imputer', preprocessing.Imputer()), + ('imputer', impute.SimpleImputer()), ('estimator', tree.DecisionTreeClassifier()) ] ) diff --git a/examples/tasks_tutorial.py b/examples/tasks_tutorial.py index f1f07d027..c54ecdbd9 100644 --- a/examples/tasks_tutorial.py +++ b/examples/tasks_tutorial.py @@ -7,7 +7,6 @@ import openml import pandas as pd -from pprint import pprint ############################################################################ # @@ -40,11 +39,11 @@ tasks = pd.DataFrame.from_dict(tasks, orient='index') print(tasks.columns) print("First 5 of %s tasks:" % len(tasks)) -pprint(tasks.head()) +print(tasks.head()) # The same can be obtained through lesser lines of code tasks_df = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe') -pprint(tasks_df.head()) +print(tasks_df.head()) ############################################################################ # We can filter the list of tasks to only contain datasets with more than @@ -78,7 +77,7 @@ tasks = openml.tasks.list_tasks(tag='OpenML100') tasks = pd.DataFrame.from_dict(tasks, orient='index') print("First 5 of %s tasks:" % len(tasks)) -pprint(tasks.head()) +print(tasks.head()) ############################################################################ # Furthermore, we can list tasks based on the dataset id: @@ -86,14 +85,14 @@ tasks = openml.tasks.list_tasks(data_id=1471) tasks = pd.DataFrame.from_dict(tasks, orient='index') print("First 5 of %s tasks:" % len(tasks)) -pprint(tasks.head()) +print(tasks.head()) ############################################################################ # In addition, a size limit and an offset can be applied both separately and simultaneously: tasks = openml.tasks.list_tasks(size=10, offset=50) tasks = pd.DataFrame.from_dict(tasks, orient='index') -pprint(tasks) +print(tasks) ############################################################################ # @@ -134,11 +133,87 @@ ############################################################################ # Properties of the task are stored as member variables: -pprint(vars(task)) +print(task) ############################################################################ # And: ids = [2, 1891, 31, 9983] tasks = openml.tasks.get_tasks(ids) -pprint(tasks[0]) +print(tasks[0]) + +############################################################################ +# Creating tasks +# ^^^^^^^^^^^^^^ +# +# You can also create new tasks. Take the following into account: +# +# * You can only create tasks on _active_ datasets +# * For now, only the following tasks are supported: classification, regression, +# clustering, and learning curve analysis. +# * For now, tasks can only be created on a single dataset. +# * The exact same task must not already exist. +# +# Creating a task requires the following input: +# +# * task_type_id: The task type ID, required (see below). Required. +# * dataset_id: The dataset ID. Required. +# * target_name: The name of the attribute you aim to predict. +# Optional. +# * estimation_procedure_id : The ID of the estimation procedure used to create train-test +# splits. Optional. +# * evaluation_measure: The name of the evaluation measure. Optional. +# * Any additional inputs for specific tasks +# +# It is best to leave the evaluation measure open if there is no strong prerequisite for a +# specific measure. 
OpenML will always compute all appropriate measures and you can filter +# or sort results on your favourite measure afterwards. Only add an evaluation measure if +# necessary (e.g. when other measure make no sense), since it will create a new task, which +# scatters results across tasks. + + +############################################################################ +# Example +# ####### +# +# Let's create a classification task on a dataset. In this example we will do this on the +# Iris dataset (ID=128 (on test server)). We'll use 10-fold cross-validation (ID=1), +# and _predictive accuracy_ as the predefined measure (this can also be left open). +# If a task with these parameters exist, we will get an appropriate exception. +# If such a task doesn't exist, a task will be created and the corresponding task_id +# will be returned. + + +# using test server for example uploads +openml.config.start_using_configuration_for_example() + +try: + tasktypes = openml.tasks.TaskTypeEnum + my_task = openml.tasks.create_task( + task_type_id=tasktypes.SUPERVISED_CLASSIFICATION, + dataset_id=128, + target_name="class", + evaluation_measure="predictive_accuracy", + estimation_procedure_id=1) + my_task.publish() +except openml.exceptions.OpenMLServerException as e: + # Error code for 'task already exists' + if e.code == 614: + # Lookup task + tasks = openml.tasks.list_tasks(data_id=128, output_format='dataframe').to_numpy() + tasks = tasks[tasks[:, 4] == "Supervised Classification"] + tasks = tasks[tasks[:, 6] == "10-fold Crossvalidation"] + tasks = tasks[tasks[:, 19] == "predictive_accuracy"] + task_id = tasks[0][0] + print("Task already exists. Task ID is", task_id) + +# reverting to prod server +openml.config.stop_using_configuration_for_example() + + +############################################################################ +# [Complete list of task types](https://www.openml.org/search?type=task_type) +# [Complete list of model estimation procedures]( +# https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure) +# [Complete list of evaluation measures]( +# https://www.openml.org/search?q=measure_type%3Aevaluation_measure&type=measure) diff --git a/openml/__version__.py b/openml/__version__.py index bfb63854a..fd6968a5d 100644 --- a/openml/__version__.py +++ b/openml/__version__.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.9.0" +__version__ = "0.10.0" diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py index 78bc41237..8f52e16fc 100644 --- a/openml/datasets/__init__.py +++ b/openml/datasets/__init__.py @@ -6,6 +6,7 @@ get_datasets, list_datasets, status_update, + list_qualities ) from .dataset import OpenMLDataset from .data_feature import OpenMLDataFeature @@ -20,4 +21,5 @@ 'OpenMLDataset', 'OpenMLDataFeature', 'status_update', + 'list_qualities' ] diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py index b271e63dc..077be639e 100644 --- a/openml/datasets/data_feature.py +++ b/openml/datasets/data_feature.py @@ -1,18 +1,19 @@ class OpenMLDataFeature(object): - """Data Feature (a.k.a. Attribute) object. + """ + Data Feature (a.k.a. Attribute) object. 
- Parameters - ---------- - index : int - The index of this feature - name : str - Name of the feature - data_type : str - can be nominal, numeric, string, date (corresponds to arff) - nominal_values : list(str) - list of the possible values, in case of nominal attribute - number_missing_values : int - """ + Parameters + ---------- + index : int + The index of this feature + name : str + Name of the feature + data_type : str + can be nominal, numeric, string, date (corresponds to arff) + nominal_values : list(str) + list of the possible values, in case of nominal attribute + number_missing_values : int + """ LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date'] def __init__(self, index, name, data_type, nominal_values, @@ -22,8 +23,16 @@ def __init__(self, index, name, data_type, nominal_values, if data_type not in self.LEGAL_DATA_TYPES: raise ValueError('data type should be in %s, found: %s' % (str(self.LEGAL_DATA_TYPES), data_type)) - if nominal_values is not None and type(nominal_values) != list: - raise ValueError('Nominal_values is of wrong datatype') + if data_type == 'nominal': + if nominal_values is None: + raise TypeError('Dataset features require attribute `nominal_values` for nominal ' + 'feature type.') + elif not isinstance(nominal_values, list): + raise TypeError('Argument `nominal_values` is of wrong datatype, should be list, ' + 'but is {}'.format(type(nominal_values))) + else: + if nominal_values is not None: + raise TypeError('Argument `nominal_values` must be None for non-nominal feature.') if type(number_missing_values) != int: raise ValueError('number_missing_values is of wrong datatype') @@ -33,7 +42,7 @@ def __init__(self, index, name, data_type, nominal_values, self.nominal_values = nominal_values self.number_missing_values = number_missing_values - def __str__(self): + def __repr__(self): return "[%d - %s (%s)]" % (self.index, self.name, self.data_type) def _repr_pretty_(self, pp, cycle): diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index b6833a513..630fac35e 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -132,9 +132,9 @@ def __init__(self, name, description, format=None, self.default_target_attribute = default_target_attribute self.row_id_attribute = row_id_attribute if isinstance(ignore_attribute, str): - self.ignore_attributes = [ignore_attribute] + self.ignore_attribute = [ignore_attribute] elif isinstance(ignore_attribute, list) or ignore_attribute is None: - self.ignore_attributes = ignore_attribute + self.ignore_attribute = ignore_attribute else: raise ValueError('Wrong data type for ignore_attribute. 
' 'Should be list.') @@ -153,7 +153,6 @@ def __init__(self, name, description, format=None, if features is not None: self.features = {} - # todo add nominal values (currently not in database) for idx, xmlfeature in enumerate(features['oml:feature']): nr_missing = xmlfeature.get('oml:number_of_missing_values', 0) feature = OpenMLDataFeature(int(xmlfeature['oml:index']), @@ -173,6 +172,36 @@ def __init__(self, name, description, format=None, else: self.data_pickle_file = None + def __repr__(self): + header = "OpenML Dataset" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Name": self.name, + "Version": self.version, + "Format": self.format, + "Licence": self.licence, + "Download URL": self.url, + "Data file": self.data_file, + "Pickle file": self.data_pickle_file, + "# of features": len(self.features)} + if self.upload_date is not None: + fields["Upload Date"] = self.upload_date.replace('T', ' ') + if self.dataset_id is not None: + fields["OpenML URL"] = "{}d/{}".format(base_url, self.dataset_id) + if self.qualities['NumberOfInstances'] is not None: + fields["# of instances"] = int(self.qualities['NumberOfInstances']) + + # determines the order in which the information will be printed + order = ["Name", "Version", "Format", "Upload Date", "Licence", "Download URL", + "OpenML URL", "Data File", "Pickle File", "# of features", "# of instances"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body + def _data_arff_to_pickle(self, data_file): data_pickle_file = data_file.replace('.arff', '.pkl.py3') if os.path.exists(data_pickle_file): @@ -368,9 +397,25 @@ def decode_arff(fh): def _convert_array_format(data, array_format, attribute_names): """Convert a dataset to a given array format. - By default, the data are stored as a sparse matrix or a pandas - dataframe. One might be interested to get a pandas SparseDataFrame or a - NumPy array instead, respectively. + Converts to numpy array if data is non-sparse. + Converts to a sparse dataframe if data is sparse. + + Parameters + ---------- + array_format : str {'array', 'dataframe'} + Desired data type of the output + - If array_format='array' + If data is non-sparse + Converts to numpy-array + Enforces numeric encoding of categorical columns + Missing values are represented as NaN in the numpy-array + else returns data as is + - If array_format='dataframe' + If data is sparse + Works only on sparse data + Converts sparse data to sparse dataframe + else returns data as is + """ if array_format == "array" and not scipy.sparse.issparse(data): # We encode the categories such that they are integer to be able @@ -396,8 +441,11 @@ def _encode_if_category(column): 'PyOpenML cannot handle string when returning numpy' ' arrays. Use dataset_format="dataframe".' ) - if array_format == "dataframe" and scipy.sparse.issparse(data): + elif array_format == "dataframe" and scipy.sparse.issparse(data): return pd.SparseDataFrame(data, columns=attribute_names) + else: + data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data" + warn("Cannot convert {} to '{}'. 
Returning input data.".format(data_type, array_format)) return data @staticmethod @@ -423,7 +471,7 @@ def get_data( self, target: Optional[Union[List[str], str]] = None, include_row_id: bool = False, - include_ignore_attributes: bool = False, + include_ignore_attribute: bool = False, dataset_format: str = "dataframe", ) -> Tuple[ Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix], @@ -440,7 +488,7 @@ def get_data( Splitting multiple columns is currently not supported. include_row_id : boolean (default=False) Whether to include row ids in the returned dataset. - include_ignore_attributes : boolean (default=False) + include_ignore_attribute : boolean (default=False) Whether to include columns that are marked as "ignore" on the server in the dataset. dataset_format : string (default='dataframe') @@ -479,11 +527,11 @@ def get_data( elif isinstance(self.row_id_attribute, Iterable): to_exclude.extend(self.row_id_attribute) - if not include_ignore_attributes and self.ignore_attributes is not None: - if isinstance(self.ignore_attributes, str): - to_exclude.append(self.ignore_attributes) - elif isinstance(self.ignore_attributes, Iterable): - to_exclude.extend(self.ignore_attributes) + if not include_ignore_attribute and self.ignore_attribute is not None: + if isinstance(self.ignore_attribute, str): + to_exclude.append(self.ignore_attribute) + elif isinstance(self.ignore_attribute, Iterable): + to_exclude.extend(self.ignore_attribute) if len(to_exclude) > 0: logger.info("Going to remove the following attributes:" @@ -566,7 +614,7 @@ def retrieve_class_labels(self, target_name: str = 'class') -> Union[None, List[ return None def get_features_by_type(self, data_type, exclude=None, - exclude_ignore_attributes=True, + exclude_ignore_attribute=True, exclude_row_id_attribute=True): """ Return indices of features of a given type, e.g. all nominal features. 
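The renamed `include_ignore_attribute` keyword and the clarified `dataset_format` behaviour documented above can be exercised with a short sketch like the following. The dataset id is arbitrary, and the four-element return tuple is assumed from the surrounding code rather than restated from the official docs.

```python
import openml

dataset = openml.datasets.get_dataset(61)  # arbitrary public dataset (Iris)

# 'dataframe' keeps pandas dtypes; 'array' enforces a numeric numpy encoding
# and now warns, instead of silently returning, when conversion is not possible.
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute,
    dataset_format='dataframe',
    include_ignore_attribute=False,  # renamed from include_ignore_attributes
)
print(X.shape, y.shape, sum(categorical_indicator))
```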
@@ -579,7 +627,7 @@ def get_features_by_type(self, data_type, exclude=None, exclude : list(int) Indices to exclude (and adapt the return values as if these indices are not present) - exclude_ignore_attributes : bool + exclude_ignore_attribute : bool Whether to exclude the defined ignore attributes (and adapt the return values as if these indices are not present) exclude_row_id_attribute : bool @@ -593,9 +641,9 @@ def get_features_by_type(self, data_type, exclude=None, """ if data_type not in OpenMLDataFeature.LEGAL_DATA_TYPES: raise TypeError("Illegal feature type requested") - if self.ignore_attributes is not None: - if not isinstance(self.ignore_attributes, list): - raise TypeError("ignore_attributes should be a list") + if self.ignore_attribute is not None: + if not isinstance(self.ignore_attribute, list): + raise TypeError("ignore_attribute should be a list") if self.row_id_attribute is not None: if not isinstance(self.row_id_attribute, str): raise TypeError("row id attribute should be a str") @@ -607,8 +655,8 @@ def get_features_by_type(self, data_type, exclude=None, to_exclude = [] if exclude is not None: to_exclude.extend(exclude) - if exclude_ignore_attributes and self.ignore_attributes is not None: - to_exclude.extend(self.ignore_attributes) + if exclude_ignore_attribute and self.ignore_attribute is not None: + to_exclude.extend(self.ignore_attribute) if exclude_row_id_attribute and self.row_id_attribute is not None: to_exclude.append(self.row_id_attribute) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 30f58757c..1ed888ec1 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -165,6 +165,30 @@ def _get_cache_directory(dataset: OpenMLDataset) -> str: return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id) +def list_qualities() -> List[str]: + """ Return list of data qualities available. + + The function performs an API call to retrieve the entire list of + data qualities that are computed on the datasets uploaded. 
+ + Returns + ------- + list + """ + api_call = "data/qualities/list" + xml_string = openml._api_calls._perform_api_call(api_call, 'get') + qualities = xmltodict.parse(xml_string, force_list=('oml:quality')) + # Minimalistic check if the XML is useful + if 'oml:data_qualities_list' not in qualities: + raise ValueError('Error in return XML, does not contain ' + '"oml:data_qualities_list"') + if not isinstance(qualities['oml:data_qualities_list']['oml:quality'], list): + raise TypeError('Error in return XML, does not contain ' + '"oml:quality" as a list') + qualities = qualities['oml:data_qualities_list']['oml:quality'] + return qualities + + def list_datasets( offset: Optional[int] = None, size: Optional[int] = None, @@ -277,10 +301,10 @@ def __list_datasets(api_call, output_format='dict'): datasets = dict() for dataset_ in datasets_dict['oml:data']['oml:dataset']: - ignore_attributes = ['oml:file_id', 'oml:quality'] + ignore_attribute = ['oml:file_id', 'oml:quality'] dataset = {k.replace('oml:', ''): v for (k, v) in dataset_.items() - if k not in ignore_attributes} + if k not in ignore_attribute} dataset['did'] = int(dataset['did']) dataset['version'] = int(dataset['version']) diff --git a/openml/evaluations/__init__.py b/openml/evaluations/__init__.py index 650ba3502..43cec8738 100644 --- a/openml/evaluations/__init__.py +++ b/openml/evaluations/__init__.py @@ -1,4 +1,5 @@ from .evaluation import OpenMLEvaluation -from .functions import list_evaluations +from .functions import list_evaluations, list_evaluation_measures, list_evaluations_setups -__all__ = ['OpenMLEvaluation', 'list_evaluations'] +__all__ = ['OpenMLEvaluation', 'list_evaluations', 'list_evaluation_measures', + 'list_evaluations_setups'] diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py index a22b6598f..48b407575 100644 --- a/openml/evaluations/evaluation.py +++ b/openml/evaluations/evaluation.py @@ -1,3 +1,5 @@ +import openml.config + class OpenMLEvaluation(object): """ @@ -47,3 +49,32 @@ def __init__(self, run_id, task_id, setup_id, flow_id, flow_name, self.value = value self.values = values self.array_data = array_data + + def __repr__(self): + header = "OpenML Evaluation" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Upload Date": self.upload_time, + "Run ID": self.run_id, + "OpenML Run URL": "{}r/{}".format(base_url, self.run_id), + "Task ID": self.task_id, + "OpenML Task URL": "{}t/{}".format(base_url, self.task_id), + "Flow ID": self.flow_id, + "OpenML Flow URL": "{}f/{}".format(base_url, self.flow_id), + "Setup ID": self.setup_id, + "Data ID": self.data_id, + "Data Name": self.data_name, + "OpenML Data URL": "{}d/{}".format(base_url, self.data_id), + "Metric Used": self.function, + "Result": self.value} + + order = ["Uploader Date", "Run ID", "OpenML Run URL", "Task ID", "OpenML Task URL" + "Flow ID", "OpenML Flow URL", "Setup ID", "Data ID", "Data Name", + "OpenML Data URL", "Metric Used", "Result"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index 322168aa4..55517f3d6 100644 --- a/openml/evaluations/functions.py +++ 
b/openml/evaluations/functions.py @@ -1,11 +1,14 @@ import json import xmltodict import pandas as pd +import numpy as np from typing import Union, List, Optional, Dict +import collections import openml.utils import openml._api_calls from ..evaluations import OpenMLEvaluation +import openml def list_evaluations( @@ -19,6 +22,7 @@ def list_evaluations( uploader: Optional[List] = None, tag: Optional[str] = None, per_fold: Optional[bool] = None, + sort_order: Optional[str] = None, output_format: str = 'object' ) -> Union[Dict, pd.DataFrame]: """ @@ -48,6 +52,9 @@ def list_evaluations( per_fold : bool, optional + sort_order : str, optional + order of sorting evaluations, ascending ("asc") or descending ("desc") + output_format: str, optional (default='object') The parameter decides the format of the output. - If 'object' the output is a dict of OpenMLEvaluation objects @@ -77,6 +84,7 @@ def list_evaluations( flow=flow, uploader=uploader, tag=tag, + sort_order=sort_order, per_fold=per_fold_str) @@ -87,6 +95,7 @@ def _list_evaluations( setup: Optional[List] = None, flow: Optional[List] = None, uploader: Optional[List] = None, + sort_order: Optional[str] = None, output_format: str = 'object', **kwargs ) -> Union[Dict, pd.DataFrame]: @@ -114,6 +123,9 @@ def _list_evaluations( kwargs: dict, optional Legal filter operators: tag, limit, offset. + sort_order : str, optional + order of sorting evaluations, ascending ("asc") or descending ("desc") + output_format: str, optional (default='dict') The parameter decides the format of the output. - If 'dict' the output is a dict of dict @@ -141,6 +153,8 @@ def _list_evaluations( api_call += "/flow/%s" % ','.join([str(int(i)) for i in flow]) if uploader is not None: api_call += "/uploader/%s" % ','.join([str(int(i)) for i in uploader]) + if sort_order is not None: + api_call += "/sort_order/%s" % sort_order return __list_evaluations(api_call, output_format=output_format) @@ -157,7 +171,7 @@ def __list_evaluations(api_call, output_format='object'): assert type(evals_dict['oml:evaluations']['oml:evaluation']) == list, \ type(evals_dict['oml:evaluations']) - evals = dict() + evals = collections.OrderedDict() for eval_ in evals_dict['oml:evaluations']['oml:evaluation']: run_id = int(eval_['oml:run_id']) value = None @@ -197,6 +211,119 @@ def __list_evaluations(api_call, output_format='object'): 'array_data': array_data} if output_format == 'dataframe': - evals = pd.DataFrame.from_dict(evals, orient='index') - + rows = [value for key, value in evals.items()] + evals = pd.DataFrame.from_records(rows, columns=rows[0].keys()) return evals + + +def list_evaluation_measures() -> List[str]: + """ Return list of evaluation measures available. + + The function performs an API call to retrieve the entire list of + evaluation measures that are available. 
+ + Returns + ------- + list + + """ + api_call = "evaluationmeasure/list" + xml_string = openml._api_calls._perform_api_call(api_call, 'get') + qualities = xmltodict.parse(xml_string, force_list=('oml:measures')) + # Minimalistic check if the XML is useful + if 'oml:evaluation_measures' not in qualities: + raise ValueError('Error in return XML, does not contain ' + '"oml:evaluation_measures"') + if not isinstance(qualities['oml:evaluation_measures']['oml:measures'][0]['oml:measure'], + list): + raise TypeError('Error in return XML, does not contain ' + '"oml:measure" as a list') + qualities = qualities['oml:evaluation_measures']['oml:measures'][0]['oml:measure'] + return qualities + + +def list_evaluations_setups( + function: str, + offset: Optional[int] = None, + size: Optional[int] = None, + id: Optional[List] = None, + task: Optional[List] = None, + setup: Optional[List] = None, + flow: Optional[List] = None, + uploader: Optional[List] = None, + tag: Optional[str] = None, + per_fold: Optional[bool] = None, + sort_order: Optional[str] = None, + output_format: str = 'dataframe' +) -> Union[Dict, pd.DataFrame]: + """ + List all run-evaluation pairs matching all of the given filters + and their hyperparameter settings. + + Parameters + ---------- + function : str + the evaluation function. e.g., predictive_accuracy + offset : int, optional + the number of runs to skip, starting from the first + size : int, optional + the maximum number of runs to show + id : list[int], optional + the list of evaluation ID's + task : list[int], optional + the list of task ID's + setup: list[int], optional + the list of setup ID's + flow : list[int], optional + the list of flow ID's + uploader : list[int], optional + the list of uploader ID's + tag : str, optional + filter evaluation based on given tag + per_fold : bool, optional + sort_order : str, optional + order of sorting evaluations, ascending ("asc") or descending ("desc") + output_format: str, optional (default='dataframe') + The parameter decides the format of the output. + - If 'dict' the output is a dict of dict + - If 'dataframe' the output is a pandas DataFrame + + + Returns + ------- + dict or dataframe with hyperparameter settings as a list of tuples. 
+ """ + # List evaluations + evals = list_evaluations(function=function, offset=offset, size=size, id=id, task=task, + setup=setup, flow=flow, uploader=uploader, tag=tag, + per_fold=per_fold, sort_order=sort_order, output_format='dataframe') + + # List setups + # Split setups in evals into chunks of N setups as list_setups does not support large size + df = pd.DataFrame() + if len(evals) != 0: + N = 100 + setup_chunks = np.split(evals['setup_id'].unique(), + ((len(evals['setup_id'].unique()) - 1) // N) + 1) + setups = pd.DataFrame() + for setup in setup_chunks: + result = pd.DataFrame(openml.setups.list_setups(setup=setup, output_format='dataframe')) + result.drop('flow_id', axis=1, inplace=True) + # concat resulting setup chunks into single datframe + setups = pd.concat([setups, result], ignore_index=True) + parameters = [] + # Convert parameters of setup into list of tuples of (hyperparameter, value) + for parameter_dict in setups['parameters']: + if parameter_dict is not None: + parameters.append([tuple([param['parameter_name'], param['value']]) + for param in parameter_dict.values()]) + else: + parameters.append([]) + setups['parameters'] = parameters + # Merge setups with evaluations + df = pd.merge(evals, setups, on='setup_id', how='left') + + if output_format == 'dataframe': + return df + else: + return df.to_dict(orient='index') diff --git a/openml/exceptions.py b/openml/exceptions.py index 2bd52ca49..492587adc 100644 --- a/openml/exceptions.py +++ b/openml/exceptions.py @@ -25,7 +25,7 @@ def __init__(self, message: str, code: str = None, additional: str = None, url: self.url = url super().__init__(message) - def __str__(self): + def __repr__(self): return '%s returned code %s: %s' % ( self.url, self.code, self.message, ) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index ce8e4ebf9..d44b61ae7 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -87,6 +87,122 @@ def can_handle_model(cls, model: Any) -> bool: """ return isinstance(model, sklearn.base.BaseEstimator) + @classmethod + def trim_flow_name( + cls, + long_name: str, + extra_trim_length: int = 100, + _outer: bool = True + ) -> str: + """ Shorten generated sklearn flow name to at most `max_length` characters. + + Flows are assumed to have the following naming structure: + (model_selection)? (pipeline)? (steps)+ + and will be shortened to: + sklearn.(selection.)?(pipeline.)?(steps)+ + e.g. (white spaces and newlines added for readability) + sklearn.pipeline.Pipeline( + columntransformer=sklearn.compose._column_transformer.ColumnTransformer( + numeric=sklearn.pipeline.Pipeline( + imputer=sklearn.preprocessing.imputation.Imputer, + standardscaler=sklearn.preprocessing.data.StandardScaler), + nominal=sklearn.pipeline.Pipeline( + simpleimputer=sklearn.impute.SimpleImputer, + onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)), + variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + svc=sklearn.svm.classes.SVC) + -> + sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC) + + Parameters + ---------- + long_name : str + The full flow name generated by the scikit-learn extension. + extra_trim_length: int (default=100) + If the trimmed name would exceed `extra_trim_length` characters, additional trimming + of the short name is performed. This reduces the produced short name length. + There is no guarantee the end result will not exceed `extra_trim_length`. 
+ _outer : bool (default=True) + For internal use only. Specifies if the function is called recursively. + + Returns + ------- + str + + """ + def remove_all_in_parentheses(string: str) -> str: + string, removals = re.subn(r"\([^()]*\)", "", string) + while removals > 0: + string, removals = re.subn(r"\([^()]*\)", "", string) + return string + + # Generally, we want to trim all hyperparameters, the exception to that is for model + # selection, as the `estimator` hyperparameter is very indicative of what is in the flow. + # So we first trim name of the `estimator` specified in mode selection. For reference, in + # the example below, we want to trim `sklearn.tree.tree.DecisionTreeClassifier`, and + # keep it in the final trimmed flow name: + # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer, + # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + # Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator= + # sklearn.tree.tree.DecisionTreeClassifier)) + if 'sklearn.model_selection' in long_name: + start_index = long_name.index('sklearn.model_selection') + estimator_start = (start_index + + long_name[start_index:].index('estimator=') + + len('estimator=')) + + model_select_boilerplate = long_name[start_index:estimator_start] + # above is .g. "sklearn.model_selection._search.RandomizedSearchCV(estimator=" + model_selection_class = model_select_boilerplate.split('(')[0].split('.')[-1] + + # Now we want to also find and parse the `estimator`, for this we find the closing + # parenthesis to the model selection technique: + closing_parenthesis_expected = 1 + for i, char in enumerate(long_name[estimator_start:], start=estimator_start): + if char == '(': + closing_parenthesis_expected += 1 + if char == ')': + closing_parenthesis_expected -= 1 + if closing_parenthesis_expected == 0: + break + + model_select_pipeline = long_name[estimator_start:i] + trimmed_pipeline = cls.trim_flow_name(model_select_pipeline, _outer=False) + _, trimmed_pipeline = trimmed_pipeline.split('.', maxsplit=1) # trim module prefix + model_select_short = "sklearn.{}[{}]".format(model_selection_class, trimmed_pipeline) + name = long_name[:start_index] + model_select_short + long_name[i + 1:] + else: + name = long_name + + module_name = long_name.split('.')[0] + short_name = module_name + '.{}' + + if name.startswith('sklearn.pipeline'): + full_pipeline_class, pipeline = name[:-1].split('(', maxsplit=1) + pipeline_class = full_pipeline_class.split('.')[-1] + # We don't want nested pipelines in the short name, so we trim all complicated + # subcomponents, i.e. those with parentheses: + pipeline = remove_all_in_parentheses(pipeline) + + # then the pipeline steps are formatted e.g.: + # step1name=sklearn.submodule.ClassName,step2name... + components = [component.split('.')[-1] for component in pipeline.split(',')] + pipeline = "{}({})".format(pipeline_class, ','.join(components)) + if len(short_name.format(pipeline)) > extra_trim_length: + pipeline = "{}(...,{})".format(pipeline_class, components[-1]) + else: + # Just a simple component: e.g. 
sklearn.tree.DecisionTreeClassifier + pipeline = remove_all_in_parentheses(name).split('.')[-1] + + if not _outer: + # Anything from parenthesis in inner calls should not be culled, so we use brackets + pipeline = pipeline.replace('(', '[').replace(')', ']') + else: + # Square brackets may be introduced with nested model_selection + pipeline = pipeline.replace('[', '(').replace(']', ')') + + return short_name.format(pipeline) + ################################################################################################ # Methods for flow serialization and de-serialization @@ -402,6 +518,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: name = '%s(%s)' % (class_name, sub_components_names[1:]) else: name = class_name + short_name = SklearnExtension.trim_flow_name(name) # Get the external versions of all sub-components external_version = self._get_external_version_string(model, subcomponents) @@ -419,6 +536,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: sklearn_version_formatted = sklearn_version.replace('==', '_') flow = OpenMLFlow(name=name, class_name=class_name, + custom_name=short_name, description='Automatically created scikit-learn flow.', model=model, components=subcomponents, @@ -432,6 +550,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: # annotate a class of sklearn.svm.SVC() with the # tag svm? ], + extension=self, language='English', # TODO fill in dependencies! dependencies=dependencies) @@ -455,9 +574,12 @@ def _get_external_version_string( model_package_name, model_package_version_number, ) openml_version = self._format_external_version('openml', openml.__version__) + sklearn_version = self._format_external_version('sklearn', sklearn.__version__) + external_versions = set() external_versions.add(external_version) external_versions.add(openml_version) + external_versions.add(sklearn_version) for visitee in sub_components.values(): for external_version in visitee.external_version.split(','): external_versions.add(external_version) diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 829bc0745..0db69d16f 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -7,6 +7,8 @@ from ..extensions import get_extension_by_flow from ..utils import extract_xml_tags, _tag_entity +import openml.config + class OpenMLFlow(object): """OpenML Flow. Stores machine learning models. 
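Since `trim_flow_name` above is a classmethod with no server interaction, its effect is easy to illustrate. The import path is assumed from this diff's file layout, and the long flow name is a made-up example following the naming scheme the docstring describes.

```python
from openml.extensions.sklearn import SklearnExtension  # assumed import path

long_name = (
    "sklearn.pipeline.Pipeline("
    "imputer=sklearn.impute.SimpleImputer,"
    "estimator=sklearn.tree.tree.DecisionTreeClassifier)"
)
# Hyperparameter-style components are reduced to their bare class names.
print(SklearnExtension.trim_flow_name(long_name))
# -> sklearn.Pipeline(SimpleImputer,DecisionTreeClassifier)
```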
@@ -85,7 +87,7 @@ def __init__(self, name, description, model, components, parameters, dependencies, class_name=None, custom_name=None, binary_url=None, binary_format=None, binary_md5=None, uploader=None, upload_date=None, - flow_id=None, version=None): + flow_id=None, extension=None, version=None): self.name = name self.description = description self.model = model @@ -129,8 +131,47 @@ def __init__(self, name, description, model, components, parameters, self.language = language self.dependencies = dependencies self.flow_id = flow_id + if extension is None: + self._extension = get_extension_by_flow(self) + else: + self._extension = extension - self.extension = get_extension_by_flow(self) + @property + def extension(self): + if self._extension is not None: + return self._extension + else: + raise RuntimeError("No extension could be found for flow {}: {}" + .format(self.flow_id, self.name)) + + def __repr__(self): + header = "OpenML Flow" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Flow Name": self.name, + "Flow Description": self.description, + "Dependencies": self.dependencies} + if self.flow_id is not None: + if self.version is not None: + fields["Flow ID"] = "{} (version {})".format(self.flow_id, self.version) + else: + fields["Flow ID"] = self.flow_id + fields["Flow URL"] = "{}f/{}".format(base_url, self.flow_id) + if self.upload_date is not None: + fields["Upload Date"] = self.upload_date.replace('T', ' ') + if self.binary_url is not None: + fields["Binary URL"] = self.binary_url + + # determines the order in which the information will be printed + order = ["Flow ID", "Flow URL", "Flow Name", "Flow Description", "Binary URL", + "Upload Date", "Dependencies"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body def _to_xml(self) -> str: """Generate xml representation of self for upload to server. @@ -378,14 +419,15 @@ def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow': _copy_server_fields(flow, self) try: openml.flows.functions.assert_flows_equal( - self, flow, flow.upload_date, ignore_parameter_values=True + self, flow, flow.upload_date, + ignore_parameter_values=True, + ignore_custom_name_if_none=True ) except ValueError as e: message = e.args[0] - raise ValueError("Flow was not stored correctly on the server. " - "New flow ID is %d. Please check manually and " - "remove the flow if necessary! Error is:\n'%s'" % - (flow_id, message)) + raise ValueError("The flow on the server is inconsistent with the local flow. " + "The server flow ID is {}. Please check manually and remove " + "the flow if necessary! 
Error is:\n'{}'".format(flow_id, message)) return self def get_structure(self, key_item: str) -> Dict[str, List[str]]: diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 5841dc699..d12bcfe91 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -92,7 +92,6 @@ def get_flow(flow_id: int, reinstantiate: bool = False) -> OpenMLFlow: if reinstantiate: flow.model = flow.extension.flow_to_model(flow) - return flow @@ -308,7 +307,8 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None: def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, ignore_parameter_values_on_older_children: str = None, - ignore_parameter_values: bool = False) -> None: + ignore_parameter_values: bool = False, + ignore_custom_name_if_none: bool = False) -> None: """Check equality of two flows. Two flows are equal if their all keys which are not set by the server @@ -326,6 +326,9 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, ignore_parameter_values : bool Whether to ignore parameter values when comparing flows. + + ignore_custom_name_if_none : bool + Whether to ignore the custom name field if either flow has `custom_name` equal to `None`. """ if not isinstance(flow1, OpenMLFlow): raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' % @@ -359,8 +362,9 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, 'argument2, but not in argument1.' % name) assert_flows_equal(attr1[name], attr2[name], ignore_parameter_values_on_older_children, - ignore_parameter_values) - elif key == 'extension': + ignore_parameter_values, + ignore_custom_name_if_none) + elif key == '_extension': continue else: if key == 'parameters': @@ -386,6 +390,13 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, # Continue needs to be done here as the first if # statement triggers in both special cases continue + elif (key == 'custom_name' + and ignore_custom_name_if_none + and (attr1 is None or attr2 is None)): + # If specified, we allow `custom_name` inequality if one flow's name is None. + # Helps with backwards compatibility as `custom_name` is now auto-generated, but + # before it used to be `None`. + continue if attr1 != attr2: raise ValueError("Flow %s: values for attribute '%s' differ: " diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 87596deca..767a4a48a 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -78,22 +78,22 @@ def run_model_on_task( Flow generated from the model. """ - extension = get_extension_by_model(model, raise_if_no_extension=True) - if extension is None: - # This should never happen and is only here to please mypy will be gone soon once the - # whole function is removed - raise TypeError(extension) - # TODO: At some point in the future do not allow for arguments in old order (6-2018). # Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019). # When removing this please also remove the method `is_estimator` from the extension # interface as it is only used here (MF, 3-2019) - if isinstance(model, OpenMLTask) and extension.is_estimator(model): + if isinstance(model, OpenMLTask): warnings.warn("The old argument order (task, model) is deprecated and " "will not be supported in the future. 
Please use the " "order (model, task).", DeprecationWarning) task, model = model, task + extension = get_extension_by_model(model, raise_if_no_extension=True) + if extension is None: + # This should never happen and is only here to please mypy will be gone soon once the + # whole function is removed + raise TypeError(extension) + flow = extension.model_to_flow(model) run = run_flow_on_task( @@ -159,9 +159,6 @@ def run_flow_on_task( if flow_tags is not None and not isinstance(flow_tags, list): raise ValueError("flow_tags should be a list") - if task.task_id is None: - raise ValueError("The task should be published at OpenML") - # TODO: At some point in the future do not allow for arguments in old order (changed 6-2018). # Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019). if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow): @@ -171,6 +168,11 @@ def run_flow_on_task( "order (model, Flow).", DeprecationWarning) task, flow = flow, task + if task.task_id is None: + raise ValueError("The task should be published at OpenML") + + if flow.model is None: + flow.model = flow.extension.flow_to_model(flow) flow.model = flow.extension.seed_model(flow.model, seed=seed) # We only need to sync with the server right now if we want to upload the flow, @@ -667,6 +669,13 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): dataset_id = int(run['oml:input_data']['oml:dataset']['oml:did']) elif not from_server: dataset_id = None + else: + # fetching the task to obtain dataset_id + t = openml.tasks.get_task(task_id, download_data=False) + if not hasattr(t, 'dataset_id'): + raise ValueError("Unable to fetch dataset_id from the task({}) " + "linked to run({})".format(task_id, run_id)) + dataset_id = t.dataset_id files = OrderedDict() evaluations = OrderedDict() diff --git a/openml/runs/run.py b/openml/runs/run.py index 0e5e12b9b..6a4818f30 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -67,13 +67,41 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None, self.tags = tags self.predictions_url = predictions_url - def __str__(self): - flow_name = self.flow_name - if flow_name is not None and len(flow_name) > 26: - # long enough to show sklearn.pipeline.Pipeline - flow_name = flow_name[:26] + "..." 
- return "[run id: {}, task id: {}, flow id: {}, flow name: {}]".format( - self.run_id, self.task_id, self.flow_id, flow_name) + def __repr__(self): + header = "OpenML Run" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Uploader Name": self.uploader_name, + "Metric": self.task_evaluation_measure, + "Run ID": self.run_id, + "Task ID": self.task_id, + "Task Type": self.task_type, + "Task URL": "{}t/{}".format(base_url, self.task_id), + "Flow ID": self.flow_id, + "Flow Name": self.flow_name, + "Flow URL": "{}f/{}".format(base_url, self.flow_id), + "Setup ID": self.setup_id, + "Setup String": self.setup_string, + "Dataset ID": self.dataset_id, + "Dataset URL": "{}d/{}".format(base_url, self.dataset_id)} + if self.uploader is not None: + fields["Uploader Profile"] = "{}u/{}".format(base_url, self.uploader) + if self.run_id is not None: + fields["Run URL"] = "{}r/{}".format(base_url, self.run_id) + if self.evaluations is not None and self.task_evaluation_measure in self.evaluations: + fields["Result"] = self.evaluations[self.task_evaluation_measure] + + # determines the order in which the information will be printed + order = ["Uploader Name", "Uploader Profile", "Metric", "Result", "Run ID", "Run URL", + "Task ID", "Task Type", "Task URL", "Flow ID", "Flow Name", "Flow URL", + "Setup ID", "Setup String", "Dataset ID", "Dataset URL"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body def _repr_pretty_(self, pp, cycle): pp.text(str(self)) diff --git a/openml/runs/trace.py b/openml/runs/trace.py index 42e89c50b..1786120e8 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -380,7 +380,7 @@ def merge_traces(cls, traces: List['OpenMLRunTrace']) -> 'OpenMLRunTrace': return cls(None, merged_trace) - def __str__(self): + def __repr__(self): return '[Run id: %d, %d trace iterations]'.format( -1 if self.run_id is None else self.run_id, len(self.trace_iterations), @@ -471,7 +471,7 @@ def get_parameters(self): result[param[len(PREFIX):]] = value return result - def __str__(self): + def __repr__(self): """ tmp string representation, will be changed in the near future """ diff --git a/openml/setups/setup.py b/openml/setups/setup.py index 91e921b55..aee1aa0bf 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -1,15 +1,17 @@ +import openml.config + class OpenMLSetup(object): """Setup object (a.k.a. Configuration). 
- Parameters - ---------- - setup_id : int - The OpenML setup id - flow_id : int - The flow that it is build upon - parameters : dict - The setting of the parameters + Parameters + ---------- + setup_id : int + The OpenML setup id + flow_id : int + The flow that it is build upon + parameters : dict + The setting of the parameters """ def __init__(self, setup_id, flow_id, parameters): @@ -25,6 +27,25 @@ def __init__(self, setup_id, flow_id, parameters): self.flow_id = flow_id self.parameters = parameters + def __repr__(self): + header = "OpenML Setup" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Setup ID": self.setup_id, + "Flow ID": self.flow_id, + "Flow URL": "{}f/{}".format(base_url, self.flow_id), + "# of Parameters": len(self.parameters)} + + # determines the order in which the information will be printed + order = ["Setup ID", "Flow ID", "Flow URL", "# of Parameters"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body + class OpenMLParameter(object): """Parameter object (used in setup). @@ -60,3 +81,34 @@ def __init__(self, input_id, flow_id, flow_name, full_name, parameter_name, self.data_type = data_type self.default_value = default_value self.value = value + + def __repr__(self): + header = "OpenML Parameter" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"ID": self.id, + "Flow ID": self.flow_id, + # "Flow Name": self.flow_name, + "Flow Name": self.full_name, + "Flow URL": "{}f/{}".format(base_url, self.flow_id), + "Parameter Name": self.parameter_name} + # indented prints for parameter attributes + # indention = 2 spaces + 1 | + 2 underscores + indent = "{}|{}".format(" " * 2, "_" * 2) + parameter_data_type = "{}Data Type".format(indent) + fields[parameter_data_type] = self.data_type + parameter_default = "{}Default".format(indent) + fields[parameter_default] = self.default_value + parameter_value = "{}Value".format(indent) + fields[parameter_value] = self.value + + # determines the order in which the information will be printed + order = ["ID", "Flow ID", "Flow Name", "Flow URL", "Parameter Name", + parameter_data_type, parameter_default, parameter_value] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body diff --git a/openml/study/functions.py b/openml/study/functions.py index 0e2f9eb3f..ccd523016 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -182,8 +182,8 @@ def create_study( where the runs are the main entity (collection consists of runs and all entities (flows, tasks, etc) that are related to these runs) - Parameters: - ----------- + Parameters + ---------- alias : str (optional) a string ID, unique on server (url-friendly) benchmark_suite : int (optional) @@ -195,8 +195,8 @@ def create_study( run_ids : list a list of run ids associated with this study - Returns: - -------- + Returns + ------- OpenMLStudy A local OpenML study object (call 
publish method to upload to server) """ @@ -228,8 +228,8 @@ def create_benchmark_suite( Creates an OpenML benchmark suite (collection of entity types, where the tasks are the linked entity) - Parameters: - ----------- + Parameters + ---------- alias : str (optional) a string ID, unique on server (url-friendly) name : str @@ -239,8 +239,8 @@ def create_benchmark_suite( task_ids : list a list of task ids associated with this study - Returns: - -------- + Returns + ------- OpenMLStudy A local OpenML study object (call publish method to upload to server) """ diff --git a/openml/study/study.py b/openml/study/study.py index 46f1339eb..8657749da 100644 --- a/openml/study/study.py +++ b/openml/study/study.py @@ -89,6 +89,39 @@ def __init__( self.runs = runs pass + def __repr__(self): + # header is provided by the sub classes + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Name": self.name, + "Status": self.status, + "Main Entity Type": self.main_entity_type} + if self.id is not None: + fields["ID"] = self.id + fields["Study URL"] = "{}s/{}".format(base_url, self.id) + if self.creator is not None: + fields["Creator"] = "{}u/{}".format(base_url, self.creator) + if self.creation_date is not None: + fields["Upload Time"] = self.creation_date.replace('T', ' ') + if self.data is not None: + fields["# of Data"] = len(self.data) + if self.tasks is not None: + fields["# of Tasks"] = len(self.tasks) + if self.flows is not None: + fields["# of Flows"] = len(self.flows) + if self.runs is not None: + fields["# of Runs"] = len(self.runs) + + # determines the order in which the information will be printed + order = ["ID", "Name", "Status", "Main Entity Type", "Study URL", + "# of Data", "# of Tasks", "# of Flows", "# of Runs", + "Creator", "Upload Time"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return body + def publish(self) -> int: """ Publish the study on the OpenML server. @@ -235,6 +268,12 @@ def __init__( setups=setups, ) + def __repr__(self): + header = "OpenML Study" + header = '{}\n{}\n'.format(header, '=' * len(header)) + body = super(OpenMLStudy, self).__repr__() + return header + body + class OpenMLBenchmarkSuite(BaseStudy): """ @@ -306,3 +345,9 @@ def __init__( runs=None, setups=None, ) + + def __repr__(self): + header = "OpenML Benchmark Suite" + header = '{}\n{}\n'.format(header, '=' * len(header)) + body = super(OpenMLBenchmarkSuite, self).__repr__() + return header + body diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 69850a096..4bb93b007 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -133,14 +133,14 @@ def list_tasks( ) -> Union[Dict, pd.DataFrame]: """ Return a number of tasks having the given tag and task_type_id + Parameters ---------- Filter task_type_id is separated from the other filters because it is used as task_type_id in the task description, but it is named type when used as a filter in list tasks call. task_type_id : int, optional - ID of the task type as detailed - `here `_. + ID of the task type as detailed `here `_. 
- Supervised classification: 1 - Supervised regression: 2 - Learning curve: 3 @@ -362,7 +362,7 @@ def get_task(task_id: int, download_data: bool = True) -> OpenMLTask: # List of class labels availaible in dataset description # Including class labels as part of task meta data handles # the case where data download was initially disabled - if isinstance(task, OpenMLClassificationTask): + if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): task.class_labels = \ dataset.retrieve_class_labels(task.target_name) # Clustering tasks do not have class labels diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 6e0154726..83af79373 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -55,6 +55,36 @@ def __init__( self.estimation_procedure_id = estimation_procedure_id self.split = None # type: Optional[OpenMLSplit] + def __repr__(self): + header = "OpenML Task" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Task Type": self.task_type} + if self.task_id is not None: + fields["Task ID"] = self.task_id + fields["Task URL"] = "{}t/{}".format(base_url, self.task_id) + if self.evaluation_measure is not None: + fields["Evaluation Measure"] = self.evaluation_measure + if self.estimation_procedure is not None: + fields["Estimation Procedure"] = self.estimation_procedure['type'] + if self.target_name is not None: + fields["Target Feature"] = self.target_name + if hasattr(self, 'class_labels'): + fields["# of Classes"] = len(self.class_labels) + if hasattr(self, 'cost_matrix'): + fields["Cost Matrix"] = "Available" + + # determines the order in which the information will be printed + order = ["Task Type", "Task ID", "Task URL", "Estimation Procedure", "Evaluation Measure", + "Target Feature", "# of Classes", "Cost Matrix"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body + def get_dataset(self) -> datasets.OpenMLDataset: """Download dataset associated with task""" return datasets.get_dataset(self.dataset_id) diff --git a/openml/testing.py b/openml/testing.py index 1ce0862d0..370fb9102 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -17,6 +17,8 @@ import openml from openml.tasks import TaskTypeEnum +import logging + class TestBase(unittest.TestCase): """Base class for tests @@ -26,6 +28,15 @@ class TestBase(unittest.TestCase): Currently hard-codes a read-write key. Hopefully soon allows using a test server, not the production server. """ + publish_tracker = {'run': [], 'data': [], 'flow': [], 'task': [], + 'study': [], 'user': []} # type: dict + test_server = "https://test.openml.org/api/v1/xml" + # amueller's read/write key that he will throw away later + apikey = "610344db6388d9ba34f6db45a3cf71de" + + # creating logger for tracking files uploaded to test server + logger = logging.getLogger("unit_tests_published_entities") + logger.setLevel(logging.DEBUG) def setUp(self, n_levels: int = 1): """Setup variables and temporary directories. 
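The `__repr__` methods added above for runs, setups, parameters, studies, and tasks all share one field-table layout: a `fields` dict, an explicit `order` list, and dot-padded line formatting under an underlined header. The sketch below uses a hypothetical class (not part of this patch) to show how that layout renders; it only assumes an installed `openml` package for the server URL.

```python
import openml

# Minimal sketch of the field-table formatting shared by the new __repr__
# methods; the class and its fields are illustrative only.
class _ReprExample:
    def __init__(self, task_id, flow_id):
        self.task_id = task_id
        self.flow_id = flow_id

    def __repr__(self):
        header = "OpenML Example"
        header = '{}\n{}\n'.format(header, '=' * len(header))
        base_url = openml.config.server[:-len('api/v1/xml')]
        fields = {"Task ID": self.task_id,
                  "Task URL": "{}t/{}".format(base_url, self.task_id),
                  "Flow ID": self.flow_id}
        # explicit ordering of the printed fields, as in the patch
        order = ["Task ID", "Task URL", "Flow ID"]
        fields = [(key, fields[key]) for key in order if key in fields]
        longest = max(len(name) for name, _ in fields)
        line_format = "{{:.<{}}}: {{}}".format(longest)
        return header + '\n'.join(line_format.format(name, value)
                                   for name, value in fields)

print(_ReprExample(task_id=31, flow_id=100))
```

Printed, this yields dot-padded lines such as `Task ID.: 31` under an underlined header, matching the output style of the new `__repr__` methods.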
@@ -58,7 +69,9 @@ def setUp(self, n_levels: int = 1): self.static_cache_dir = os.path.join(static_cache_dir, 'files') if self.static_cache_dir is None: - raise ValueError('Cannot find test cache dir!') + raise ValueError( + 'Cannot find test cache dir, expected it to be {}!'.format(static_cache_dir) + ) self.cwd = os.getcwd() workdir = os.path.dirname(os.path.abspath(__file__)) @@ -70,12 +83,9 @@ def setUp(self, n_levels: int = 1): os.chdir(self.workdir) self.cached = True - # amueller's read/write key that he will throw away later - openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de" + openml.config.apikey = TestBase.apikey self.production_server = "https://openml.org/api/v1/xml" - self.test_server = "https://test.openml.org/api/v1/xml" - - openml.config.server = self.test_server + openml.config.server = TestBase.test_server openml.config.avoid_duplicate_runs = False openml.config.cache_directory = self.workdir @@ -86,7 +96,7 @@ def setUp(self, n_levels: int = 1): with open(openml.config.config_file, 'w') as fh: fh.write('apikey = %s' % openml.config.apikey) - # Increase the number of retries to avoid spurios server failures + # Increase the number of retries to avoid spurious server failures self.connection_n_retries = openml.config.connection_n_retries openml.config.connection_n_retries = 10 @@ -103,6 +113,40 @@ def tearDown(self): openml.config.server = self.production_server openml.config.connection_n_retries = self.connection_n_retries + @classmethod + def _mark_entity_for_removal(self, entity_type, entity_id): + """ Static record of entities uploaded to test server + + Dictionary of lists where the keys are 'entity_type'. + Each such dictionary is a list of integer IDs. + For entity_type='flow', each list element is a tuple + of the form (Flow ID, Flow Name). + """ + if entity_type not in TestBase.publish_tracker: + TestBase.publish_tracker[entity_type] = [entity_id] + else: + TestBase.publish_tracker[entity_type].append(entity_id) + + @classmethod + def _delete_entity_from_tracker(self, entity_type, entity): + """ Deletes entity records from the static file_tracker + + Given an entity type and corresponding ID, deletes all entries, including + duplicate entries of the ID for the entity type. + """ + if entity_type in TestBase.publish_tracker: + # removes duplicate entries + TestBase.publish_tracker[entity_type] = list(set(TestBase.publish_tracker[entity_type])) + if entity_type == 'flow': + delete_index = [i for i, (id_, _) in + enumerate(TestBase.publish_tracker[entity_type]) + if id_ == entity][0] + else: + delete_index = [i for i, id_ in + enumerate(TestBase.publish_tracker[entity_type]) + if id_ == entity][0] + TestBase.publish_tracker[entity_type].pop(delete_index) + def _get_sentinel(self, sentinel=None): if sentinel is None: # Create a unique prefix for the flow. 
Necessary because the flow @@ -197,4 +241,10 @@ def _check_fold_timing_evaluations( self.assertLessEqual(evaluation, max_val) -__all__ = ['TestBase'] +try: + from sklearn.impute import SimpleImputer +except ImportError: + from sklearn.preprocessing import Imputer as SimpleImputer + + +__all__ = ['TestBase', 'SimpleImputer'] diff --git a/openml/utils.py b/openml/utils.py index 54064aca5..f6cc81ff7 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -5,6 +5,7 @@ import warnings import pandas as pd from functools import wraps +import collections import openml._api_calls import openml.exceptions @@ -182,7 +183,7 @@ def _list_all(listing_call, output_format='dict', *args, **filters): active_filters = {key: value for key, value in filters.items() if value is not None} page = 0 - result = {} + result = collections.OrderedDict() if output_format == 'dataframe': result = pd.DataFrame() diff --git a/setup.cfg b/setup.cfg index fac02f0f9..726c8fa73 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,17 +1,6 @@ [metadata] description-file = README.md -[nosetests] -# nosetests skips test files with the executable bit by default -# which can silently hide failing tests. -exe = 1 -cover-html = 1 -cover-html-dir = coverage -cover-package = openml - -detailed-errors = 1 -with-doctest = 1 -doctest-tests = 1 -doctest-extension = rst -doctest-fixtures = _fixture -#doctest-options = +ELLIPSIS,+NORMALIZE_WHITESPACE +[tool:pytest] +filterwarnings = + ignore:the matrix subclass:PendingDeprecationWarning diff --git a/setup.py b/setup.py index ae676eaf8..3b271badd 100644 --- a/setup.py +++ b/setup.py @@ -6,13 +6,6 @@ with open("openml/__version__.py") as fh: version = fh.readlines()[-1].split()[-1].strip("\"'") -# Using Python setup.py install will try to build numpy which is prone to failure and -# very time consuming anyway. -if len(sys.argv) > 1 and sys.argv[1] == 'install': - print('Please install this package with pip: `pip install -e .` ' - 'Installation requires pip>=10.0.') - sys.exit(1) - if sys.version_info < (3, 5): raise ValueError( 'Unsupported Python version {}.{}.{} found. OpenML requires Python 3.5 or higher.' @@ -35,6 +28,7 @@ version=version, packages=setuptools.find_packages(), package_data={'': ['*.txt', '*.md']}, + python_requires=">=3.5", install_requires=[ 'liac-arff>=2.4.0', 'xmltodict', @@ -79,7 +73,6 @@ 'Operating System :: Unix', 'Operating System :: MacOS', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7']) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..9e08d09a8 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,181 @@ +'''This file is recognized by pytest for defining specified behaviour + +'conftest.py' files are directory-scope files that are shared by all +sub-directories from where this file is placed. pytest recognises +'conftest.py' for any unit test executed from within this directory +tree. This file is used to define fixtures, hooks, plugins, and other +functionality that can be shared by the unit tests. + +This file has been created for the OpenML testing to primarily make use +of the pytest hooks 'pytest_sessionstart' and 'pytest_sessionfinish', +which are being used for managing the deletion of local and remote files +created by the unit tests, run across more than one process. 
+ +This design allows one to comment or remove the conftest.py file to +disable file deletions, without editing any of the test case files. + + +Possible Future: class TestBase from openml/testing.py can be included + under this file and there would not be any requirements to import + testing.py in each of the unit test modules. +''' + +import os +import logging +from typing import List + +import openml +from openml.testing import TestBase + +# creating logger for unit test file deletion status +logger = logging.getLogger("unit_tests") +logger.setLevel(logging.DEBUG) + +file_list = [] +directory = None + +# finding the root directory of conftest.py and going up to OpenML main directory +# exploiting the fact that conftest.py always resides in the root directory for tests +static_dir = os.path.dirname(os.path.abspath(__file__)) +logging.info("static directory: {}".format(static_dir)) +print("static directory: {}".format(static_dir)) +while True: + if 'openml' in os.listdir(static_dir): + break + static_dir = os.path.join(static_dir, '..') + + +def worker_id() -> str: + ''' Returns the name of the worker process owning this function call. + + :return: str + Possible outputs from the set of {'master', 'gw0', 'gw1', ..., 'gw(n-1)'} + where n is the number of workers being used by pytest-xdist + ''' + vars_ = list(os.environ.keys()) + if 'PYTEST_XDIST_WORKER' in vars_ or 'PYTEST_XDIST_WORKER_COUNT' in vars_: + return os.environ['PYTEST_XDIST_WORKER'] + else: + return 'master' + + +def read_file_list() -> List[str]: + '''Returns a list of paths to all files that currently exist in 'openml/tests/files/' + + :return: List[str] + ''' + directory = os.path.join(static_dir, 'tests/files/') + if worker_id() == 'master': + logger.info("Collecting file lists from: {}".format(directory)) + files = os.walk(directory) + file_list = [] + for root, _, filenames in files: + for filename in filenames: + file_list.append(os.path.join(root, filename)) + return file_list + + +def compare_delete_files(old_list, new_list) -> None: + '''Deletes files that are there in the new_list but not in the old_list + + :param old_list: List[str] + :param new_list: List[str] + :return: None + ''' + file_list = list(set(new_list) - set(old_list)) + for file in file_list: + os.remove(file) + logger.info("Deleted from local: {}".format(file)) + + +def delete_remote_files(tracker) -> None: + '''Function that deletes the entities passed as input, from the OpenML test server + + The TestBase class in openml/testing.py has an attribute called publish_tracker. + This function expects the dictionary of the same structure. + It is a dictionary of lists, where the keys are entity types, while the values are + lists of integer IDs, except for key 'flow' where the value is a tuple (ID, flow name). + + Iteratively, multiple POST requests are made to the OpenML test server using + openml.utils._delete_entity() to remove the entities uploaded by all the unit tests. 
+ + :param tracker: Dict + :return: None + ''' + openml.config.server = TestBase.test_server + openml.config.apikey = TestBase.apikey + + # reordering to delete sub flows at the end of flows + # sub-flows have shorter names, hence, sorting by descending order of flow name length + if 'flow' in tracker: + flow_deletion_order = [entity_id for entity_id, _ in + sorted(tracker['flow'], key=lambda x: len(x[1]), reverse=True)] + tracker['flow'] = flow_deletion_order + + # deleting all collected entities published to test server + # 'run's are deleted first to prevent dependency issue of entities on deletion + logger.info("Entity Types: {}".format(['run', 'data', 'flow', 'task', 'study'])) + for entity_type in ['run', 'data', 'flow', 'task', 'study']: + logger.info("Deleting {}s...".format(entity_type)) + for i, entity in enumerate(tracker[entity_type]): + try: + openml.utils._delete_entity(entity_type, entity) + logger.info("Deleted ({}, {})".format(entity_type, entity)) + except Exception as e: + logger.warn("Cannot delete ({},{}): {}".format(entity_type, entity, e)) + + +def pytest_sessionstart() -> None: + '''pytest hook that is executed before any unit test starts + + This function will be called by each of the worker processes, along with the master process + when they are spawned. This happens even before the collection of unit tests. + If number of workers, n=4, there will be a total of 5 (1 master + 4 workers) calls of this + function, before execution of any unit test begins. The master pytest process has the name + 'master' while the worker processes are named as 'gw{i}' where i = 0, 1, ..., n-1. + The order of process spawning is: 'master' -> random ordering of the 'gw{i}' workers. + + Since, master is always executed first, it is checked if the current process is 'master' and + store a list of strings of paths of all files in the directory (pre-unit test snapshot). + + :return: None + ''' + # file_list is global to maintain the directory snapshot during tear down + global file_list + worker = worker_id() + if worker == 'master': + file_list = read_file_list() + + +def pytest_sessionfinish() -> None: + '''pytest hook that is executed after all unit tests of a worker end + + This function will be called by each of the worker processes, along with the master process + when they are done with the unit tests allocated to them. + If number of workers, n=4, there will be a total of 5 (1 master + 4 workers) calls of this + function, once all allocated unit tests have finished executing. The master pytest process has the name + 'master' while the worker processes are named as 'gw{i}' where i = 0, 1, ..., n-1. + The order of invocation is: random ordering of the 'gw{i}' workers -> 'master'. 
+ + Since, master is always executed last, it is checked if the current process is 'master' and, + * Compares file list with pre-unit test snapshot and deletes all local files generated + * Iterates over the list of entities uploaded to test server and deletes them remotely + + :return: None + ''' + # allows access to the file_list read in the set up phase + global file_list + worker = worker_id() + logger.info("Finishing worker {}".format(worker)) + + # Test file deletion + logger.info("Deleting files uploaded to test server for worker {}".format(worker)) + delete_remote_files(TestBase.publish_tracker) + + if worker == 'master': + # Local file deletion + new_file_list = read_file_list() + compare_delete_files(file_list, new_file_list) + logger.info("Local files deleted") + + logging.info("{} is killed".format(worker)) diff --git a/tests/files/org/openml/test/datasets/-1/features.xml b/tests/files/org/openml/test/datasets/-1/features.xml index d46f635c1..01adbf5a8 100644 --- a/tests/files/org/openml/test/datasets/-1/features.xml +++ b/tests/files/org/openml/test/datasets/-1/features.xml @@ -180003,6 +180003,8 @@ 20000 class nominal + -1 + 1 false false false diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 5f4f9806d..cabad9565 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -141,7 +141,7 @@ def test_get_data_with_target_pandas(self): self.assertNotIn("class", attribute_names) def test_get_data_rowid_and_ignore_and_target(self): - self.dataset.ignore_attributes = ["condition"] + self.dataset.ignore_attribute = ["condition"] self.dataset.row_id_attribute = ["hardness"] X, y, categorical, names = self.dataset.get_data(target="class") self.assertEqual(X.shape, (898, 36)) @@ -151,15 +151,15 @@ def test_get_data_rowid_and_ignore_and_target(self): self.assertEqual(y.shape, (898, )) def test_get_data_with_ignore_attributes(self): - self.dataset.ignore_attributes = ["condition"] - rval, _, categorical, _ = self.dataset.get_data(include_ignore_attributes=True) + self.dataset.ignore_attribute = ["condition"] + rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True) for (dtype, is_cat) in zip(rval.dtypes, categorical): expected_type = 'category' if is_cat else 'float64' self.assertEqual(dtype.name, expected_type) self.assertEqual(rval.shape, (898, 39)) self.assertEqual(len(categorical), 39) - rval, _, categorical, _ = self.dataset.get_data(include_ignore_attributes=False) + rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False) for (dtype, is_cat) in zip(rval.dtypes, categorical): expected_type = 'category' if is_cat else 'float64' self.assertEqual(dtype.name, expected_type) @@ -271,9 +271,9 @@ def test_get_sparse_dataset_with_rowid(self): self.assertEqual(len(categorical), 20000) def test_get_sparse_dataset_with_ignore_attributes(self): - self.sparse_dataset.ignore_attributes = ["V256"] + self.sparse_dataset.ignore_attribute = ["V256"] rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format='array', include_ignore_attributes=True + dataset_format='array', include_ignore_attribute=True ) self.assertTrue(sparse.issparse(rval)) self.assertEqual(rval.dtype, np.float32) @@ -281,7 +281,7 @@ def test_get_sparse_dataset_with_ignore_attributes(self): self.assertEqual(len(categorical), 20001) rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format='array', include_ignore_attributes=False + dataset_format='array', 
include_ignore_attribute=False ) self.assertTrue(sparse.issparse(rval)) self.assertEqual(rval.dtype, np.float32) @@ -290,13 +290,13 @@ def test_get_sparse_dataset_with_ignore_attributes(self): def test_get_sparse_dataset_rowid_and_ignore_and_target(self): # TODO: re-add row_id and ignore attributes - self.sparse_dataset.ignore_attributes = ["V256"] + self.sparse_dataset.ignore_attribute = ["V256"] self.sparse_dataset.row_id_attribute = ["V512"] X, y, categorical, _ = self.sparse_dataset.get_data( dataset_format='array', target="class", include_row_id=False, - include_ignore_attributes=False, + include_ignore_attribute=False, ) self.assertTrue(sparse.issparse(X)) self.assertEqual(X.dtype, np.float32) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 0b2620485..5726d2442 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -4,6 +4,7 @@ from unittest import mock import arff +import time import pytest import numpy as np @@ -43,14 +44,14 @@ def tearDown(self): super(TestOpenMLDataset, self).tearDown() def _remove_pickle_files(self): - cache_dir = self.static_cache_dir + self.lock_path = os.path.join(openml.config.get_cache_directory(), 'locks') for did in ['-1', '2']: with lockutils.external_lock( name='datasets.functions.get_dataset:%s' % did, - lock_path=os.path.join(openml.config.get_cache_directory(), 'locks'), + lock_path=self.lock_path, ): - pickle_path = os.path.join(cache_dir, 'datasets', did, - 'dataset.pkl') + pickle_path = os.path.join(openml.config.get_cache_directory(), 'datasets', + did, 'dataset.pkl.py3') try: os.remove(pickle_path) except (OSError, FileNotFoundError): @@ -478,6 +479,9 @@ def test_publish_dataset(self): data_file=file_path, ) dataset.publish() + TestBase._mark_entity_for_removal('data', dataset.dataset_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + dataset.dataset_id)) self.assertIsInstance(dataset.dataset_id, int) def test__retrieve_class_labels(self): @@ -498,6 +502,9 @@ def test_upload_dataset_with_url(self): url="https://www.openml.org/data/download/61/dataset_61_iris.arff", ) dataset.publish() + TestBase._mark_entity_for_removal('data', dataset.dataset_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + dataset.dataset_id)) self.assertIsInstance(dataset.dataset_id, int) def test_data_status(self): @@ -507,6 +514,9 @@ def test_data_status(self): version=1, url="https://www.openml.org/data/download/61/dataset_61_iris.arff") dataset.publish() + TestBase._mark_entity_for_removal('data', dataset.dataset_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + dataset.dataset_id)) did = dataset.dataset_id # admin key for test server (only adminds can activate datasets. 
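The test hunks above and below all repeat the same bookkeeping after an upload: publish, then register the new ID with `TestBase._mark_entity_for_removal()` so the session-level cleanup in `conftest.py` can delete it from the test server. A hedged sketch of that pattern follows; the dataset itself is a toy example (names and values are illustrative, and the real tests additionally prefix names with a sentinel to keep them unique).

```python
import pandas as pd

import openml
from openml.testing import TestBase

# Sketch of the upload-then-track pattern used in test_dataset_functions.py.
# Assumes the test-server configuration provided by TestBase.
openml.config.server = TestBase.test_server
openml.config.apikey = TestBase.apikey

df = pd.DataFrame({'outlook': ['sunny', 'overcast', 'rainy'],
                   'temperature': [85.0, 83.0, 70.0],
                   'play': ['no', 'yes', 'yes']})
df['outlook'] = df['outlook'].astype('category')
df['play'] = df['play'].astype('category')

dataset = openml.datasets.functions.create_dataset(
    name='tracking_example',  # illustrative; real tests add a unique sentinel
    description='Toy dataset for the upload-tracking sketch',
    creator='OpenML tester', contributor=None, collection_date='01-01-2019',
    language='English', licence='MIT', default_target_attribute='play',
    row_id_attribute=None, ignore_attribute=None, citation='None',
    attributes='auto', data=df, version_label='example',
    original_data_url='http://openml.github.io/openml-python',
    paper_url='http://openml.github.io/openml-python',
)
upload_did = dataset.publish()
# record the new entity so conftest.py deletes it when the session finishes
TestBase._mark_entity_for_removal('data', upload_did)
TestBase.logger.info("collected from {}: {}".format(
    __file__.split('/')[-1], upload_did))
```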
@@ -620,6 +630,9 @@ def test_create_dataset_numpy(self): ) upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) self.assertEqual( _get_online_dataset_arff(upload_did), @@ -682,6 +695,9 @@ def test_create_dataset_list(self): ) upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) self.assertEqual( _get_online_dataset_arff(upload_did), dataset._dataset, @@ -725,6 +741,9 @@ def test_create_dataset_sparse(self): ) upload_did = xor_dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) self.assertEqual( _get_online_dataset_arff(upload_did), xor_dataset._dataset, @@ -762,6 +781,9 @@ def test_create_dataset_sparse(self): ) upload_did = xor_dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) self.assertEqual( _get_online_dataset_arff(upload_did), xor_dataset._dataset, @@ -885,6 +907,9 @@ def test_create_dataset_pandas(self): paper_url=paper_url ) upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) self.assertEqual( _get_online_dataset_arff(upload_did), dataset._dataset, @@ -919,6 +944,9 @@ def test_create_dataset_pandas(self): paper_url=paper_url ) upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) self.assertEqual( _get_online_dataset_arff(upload_did), dataset._dataset, @@ -955,6 +983,9 @@ def test_create_dataset_pandas(self): paper_url=paper_url ) upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) downloaded_data = _get_online_dataset_arff(upload_did) self.assertEqual( downloaded_data, @@ -1012,9 +1043,10 @@ def test_ignore_attributes_dataset(self): original_data_url=original_data_url, paper_url=paper_url ) - self.assertEqual(dataset.ignore_attributes, ['outlook']) + self.assertEqual(dataset.ignore_attribute, ['outlook']) # pass a list to ignore_attribute + ignore_attribute = ['outlook', 'windy'] dataset = openml.datasets.functions.create_dataset( name=name, description=description, @@ -1025,7 +1057,7 @@ def test_ignore_attributes_dataset(self): licence=licence, default_target_attribute=default_target_attribute, row_id_attribute=None, - ignore_attribute=['outlook', 'windy'], + ignore_attribute=ignore_attribute, citation=citation, attributes='auto', data=df, @@ -1033,7 +1065,7 @@ def test_ignore_attributes_dataset(self): original_data_url=original_data_url, paper_url=paper_url ) - self.assertEqual(dataset.ignore_attributes, ['outlook', 'windy']) + self.assertEqual(dataset.ignore_attribute, ignore_attribute) # raise an error if unknown type err_msg = 'Wrong data type for ignore_attribute. Should be list.' 
@@ -1057,6 +1089,83 @@ def test_ignore_attributes_dataset(self): paper_url=paper_url ) + def test_publish_fetch_ignore_attribute(self): + """Test to upload and retrieve dataset and check ignore_attributes""" + data = [ + ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'], + ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'], + ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'], + ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'], + ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes'] + ] + column_names = ['rnd_str', 'outlook', 'temperature', 'humidity', + 'windy', 'play'] + df = pd.DataFrame(data, columns=column_names) + # enforce the type of each column + df['outlook'] = df['outlook'].astype('category') + df['windy'] = df['windy'].astype('bool') + df['play'] = df['play'].astype('category') + # meta-information + name = '%s-pandas_testing_dataset' % self._get_sentinel() + description = 'Synthetic dataset created from a Pandas DataFrame' + creator = 'OpenML tester' + collection_date = '01-01-2018' + language = 'English' + licence = 'MIT' + default_target_attribute = 'play' + citation = 'None' + original_data_url = 'http://openml.github.io/openml-python' + paper_url = 'http://openml.github.io/openml-python' + + # pass a list to ignore_attribute + ignore_attribute = ['outlook', 'windy'] + dataset = openml.datasets.functions.create_dataset( + name=name, + description=description, + creator=creator, + contributor=None, + collection_date=collection_date, + language=language, + licence=licence, + default_target_attribute=default_target_attribute, + row_id_attribute=None, + ignore_attribute=ignore_attribute, + citation=citation, + attributes='auto', + data=df, + version_label='test', + original_data_url=original_data_url, + paper_url=paper_url + ) + + # publish dataset + upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) + # test if publish was successful + self.assertIsInstance(upload_did, int) + + dataset = None + # fetching from server + # loop till timeout or fetch not successful + max_waiting_time_seconds = 400 + # time.time() works in seconds + start_time = time.time() + while time.time() - start_time < max_waiting_time_seconds: + try: + dataset = openml.datasets.get_dataset(upload_did) + break + except Exception as e: + # returned code 273: Dataset not processed yet + # returned code 362: No qualities found + print("Failed to fetch dataset:{} with '{}'.".format(upload_did, str(e))) + time.sleep(10) + continue + if dataset is None: + raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(upload_did)) + self.assertEqual(dataset.ignore_attribute, ignore_attribute) + def test_create_dataset_row_id_attribute_error(self): # meta-information name = '%s-pandas_testing_dataset' % self._get_sentinel() @@ -1146,6 +1255,9 @@ def test_create_dataset_row_id_attribute_inference(self): ) self.assertEqual(dataset.row_id_attribute, output_row_id) upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) arff_dataset = arff.loads(_get_online_dataset_arff(upload_did)) arff_data = np.array(arff_dataset['data'], dtype=object) # if we set the name of the index then the index will be added to @@ -1190,3 +1302,8 @@ def test_create_dataset_attributes_auto_without_df(self): original_data_url=original_data_url, paper_url=paper_url ) + + def test_list_qualities(self): + qualities = 
openml.datasets.list_qualities() + self.assertEqual(isinstance(qualities, list), True) + self.assertEqual(all([isinstance(q, str) for q in qualities]), True) diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index 37e8f710d..b25b35391 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -6,6 +6,30 @@ class TestEvaluationFunctions(TestBase): _multiprocess_can_split_ = True + def _check_list_evaluation_setups(self, size, **kwargs): + evals_setups = openml.evaluations.list_evaluations_setups("predictive_accuracy", + **kwargs, size=size, + sort_order='desc', + output_format='dataframe') + evals = openml.evaluations.list_evaluations("predictive_accuracy", + **kwargs, size=size, + sort_order='desc', + output_format='dataframe') + + # Check if list is non-empty + self.assertGreater(len(evals_setups), 0) + # Check if output from sort is sorted in the right order + self.assertSequenceEqual(sorted(evals_setups['value'].tolist(), reverse=True), + evals_setups['value'].tolist()) + + # Check if output and order of list_evaluations is preserved + self.assertSequenceEqual(evals_setups['run_id'].tolist(), evals['run_id'].tolist()) + # Check if the hyper-parameter column is as accurate and flow_id + for index, row in evals_setups.iterrows(): + params = openml.runs.get_run(row['run_id']).parameter_settings + hyper_params = [tuple([param['oml:name'], param['oml:value']]) for param in params] + self.assertTrue(sorted(row['parameters']) == sorted(hyper_params)) + def test_evaluation_list_filter_task(self): openml.config.server = self.production_server @@ -116,3 +140,41 @@ def test_evaluation_list_per_fold(self): for run_id in evaluations.keys(): self.assertIsNotNone(evaluations[run_id].value) self.assertIsNone(evaluations[run_id].values) + + def test_evaluation_list_sort(self): + size = 10 + task_id = 115 + # Get all evaluations of the task + unsorted_eval = openml.evaluations.list_evaluations( + "predictive_accuracy", offset=0, task=[task_id]) + # Get top 10 evaluations of the same task + sorted_eval = openml.evaluations.list_evaluations( + "predictive_accuracy", size=size, offset=0, task=[task_id], sort_order="desc") + self.assertEqual(len(sorted_eval), size) + self.assertGreater(len(unsorted_eval), 0) + sorted_output = [evaluation.value for evaluation in sorted_eval.values()] + unsorted_output = [evaluation.value for evaluation in unsorted_eval.values()] + + # Check if output from sort is sorted in the right order + self.assertTrue(sorted(sorted_output, reverse=True) == sorted_output) + + # Compare manual sorting against sorted output + test_output = sorted(unsorted_output, reverse=True) + self.assertTrue(test_output[:size] == sorted_output) + + def test_list_evaluation_measures(self): + measures = openml.evaluations.list_evaluation_measures() + self.assertEqual(isinstance(measures, list), True) + self.assertEqual(all([isinstance(s, str) for s in measures]), True) + + def test_list_evaluations_setups_filter_flow(self): + openml.config.server = self.production_server + flow_id = [405] + size = 100 + self._check_list_evaluation_setups(size, flow=flow_id) + + def test_list_evaluations_setups_filter_task(self): + openml.config.server = self.production_server + task_id = [6] + size = 100 + self._check_list_evaluation_setups(size, task=task_id) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py 
b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index aef064ad5..8bc615516 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -1,5 +1,6 @@ import collections import json +import re import os import sys import unittest @@ -27,10 +28,6 @@ import sklearn.tree import sklearn.cluster -if LooseVersion(sklearn.__version__) < "0.20": - from sklearn.preprocessing import Imputer -else: - from sklearn.impute import SimpleImputer as Imputer import openml from openml.extensions.sklearn import SklearnExtension @@ -38,7 +35,8 @@ from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer + this_directory = os.path.dirname(os.path.abspath(__file__)) sys.path.append(this_directory) @@ -76,6 +74,7 @@ def test_serialize_model(self): max_leaf_nodes=2000) fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier' + fixture_short_name = 'sklearn.DecisionTreeClassifier' fixture_description = 'Automatically created scikit-learn flow.' version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \ % sklearn.__version__ @@ -117,6 +116,7 @@ def test_serialize_model(self): self.assertEqual(serialization.name, fixture_name) self.assertEqual(serialization.class_name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertEqual(serialization.parameters, fixture_parameters) self.assertEqual(serialization.dependencies, version_fixture) @@ -142,6 +142,7 @@ def test_serialize_model_clustering(self): model = sklearn.cluster.KMeans() fixture_name = 'sklearn.cluster.k_means_.KMeans' + fixture_short_name = 'sklearn.KMeans' fixture_description = 'Automatically created scikit-learn flow.' version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \ % sklearn.__version__ @@ -179,6 +180,7 @@ def test_serialize_model_clustering(self): self.assertEqual(serialization.name, fixture_name) self.assertEqual(serialization.class_name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertEqual(serialization.parameters, fixture_parameters) self.assertEqual(serialization.dependencies, version_fixture) @@ -204,6 +206,7 @@ def test_serialize_model_with_subcomponent(self): fixture_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' \ '(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)' fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' + fixture_short_name = 'sklearn.AdaBoostClassifier' fixture_description = 'Automatically created scikit-learn flow.' 
fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier' fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier' @@ -218,6 +221,7 @@ def test_serialize_model_with_subcomponent(self): self.assertEqual(serialization.name, fixture_name) self.assertEqual(serialization.class_name, fixture_class_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertEqual(serialization.parameters['algorithm'], '"SAMME.R"') self.assertIsInstance(serialization.parameters['base_estimator'], str) @@ -259,6 +263,7 @@ def test_serialize_pipeline(self): fixture_name = 'sklearn.pipeline.Pipeline(' \ 'scaler=sklearn.preprocessing.data.StandardScaler,' \ 'dummy=sklearn.dummy.DummyClassifier)' + fixture_short_name = 'sklearn.Pipeline(StandardScaler,DummyClassifier)' fixture_description = 'Automatically created scikit-learn flow.' fixture_structure = { fixture_name: [], @@ -270,17 +275,21 @@ def test_serialize_pipeline(self): structure = serialization.get_structure('name') self.assertEqual(serialization.name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertDictEqual(structure, fixture_structure) # Comparing the pipeline # The parameters only have the name of base objects(not the whole flow) # as value - # memory parameter has been added in 0.19 + # memory parameter has been added in 0.19, verbose in 0.21 if LooseVersion(sklearn.__version__) < "0.19": self.assertEqual(len(serialization.parameters), 1) - else: + elif LooseVersion(sklearn.__version__) < "0.21": self.assertEqual(len(serialization.parameters), 2) + else: + self.assertEqual(len(serialization.parameters), 3) + # Hard to compare two representations of a dict due to possibly # different sorting. Making a json makes it easier self.assertEqual( @@ -343,6 +352,7 @@ def test_serialize_pipeline_clustering(self): fixture_name = 'sklearn.pipeline.Pipeline(' \ 'scaler=sklearn.preprocessing.data.StandardScaler,' \ 'clusterer=sklearn.cluster.k_means_.KMeans)' + fixture_short_name = 'sklearn.Pipeline(StandardScaler,KMeans)' fixture_description = 'Automatically created scikit-learn flow.' fixture_structure = { fixture_name: [], @@ -354,6 +364,7 @@ def test_serialize_pipeline_clustering(self): structure = serialization.get_structure('name') self.assertEqual(serialization.name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertDictEqual(structure, fixture_structure) @@ -363,8 +374,10 @@ def test_serialize_pipeline_clustering(self): # memory parameter has been added in 0.19 if LooseVersion(sklearn.__version__) < "0.19": self.assertEqual(len(serialization.parameters), 1) - else: + elif LooseVersion(sklearn.__version__) < "0.21": self.assertEqual(len(serialization.parameters), 2) + else: + self.assertEqual(len(serialization.parameters), 3) # Hard to compare two representations of a dict due to possibly # different sorting. Making a json makes it easier self.assertEqual( @@ -431,6 +444,7 @@ def test_serialize_column_transformer(self): fixture = 'sklearn.compose._column_transformer.ColumnTransformer(' \ 'numeric=sklearn.preprocessing.data.StandardScaler,' \ 'nominal=sklearn.preprocessing._encoders.OneHotEncoder)' + fixture_short_name = 'sklearn.ColumnTransformer' fixture_description = 'Automatically created scikit-learn flow.' 
fixture_structure = { fixture: [], @@ -441,6 +455,7 @@ def test_serialize_column_transformer(self): serialization = self.extension.model_to_flow(model) structure = serialization.get_structure('name') self.assertEqual(serialization.name, fixture) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertDictEqual(structure, fixture_structure) # del serialization.model @@ -611,7 +626,7 @@ def test_serialize_feature_union_switched_names(self): .format(module_name_encoder)) def test_serialize_complex_flow(self): - ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[0]) + ohe = sklearn.preprocessing.OneHotEncoder() scaler = sklearn.preprocessing.StandardScaler(with_mean=False) boosting = sklearn.ensemble.AdaBoostClassifier( base_estimator=sklearn.tree.DecisionTreeClassifier()) @@ -734,15 +749,16 @@ def test_serialize_simple_parameter_grid(self): # Examples from the scikit-learn documentation models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()] grids = \ - [[{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, - {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], - 'kernel': ['rbf']}], - {"max_depth": [3, None], - "max_features": [1, 3, 10], - "min_samples_split": [1, 3, 10], - "min_samples_leaf": [1, 3, 10], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"]}] + [[OrderedDict([('C', [1, 10, 100, 1000]), ('kernel', ['linear'])]), + OrderedDict([('C', [1, 10, 100, 1000]), ('gamma', [0.001, 0.0001]), + ('kernel', ['rbf'])])], + OrderedDict([("bootstrap", [True, False]), + ("criterion", ["gini", "entropy"]), + ("max_depth", [3, None]), + ("max_features", [1, 3, 10]), + ("min_samples_leaf", [1, 3, 10]), + ("min_samples_split", [1, 3, 10]) + ])] for grid, model in zip(grids, models): serialized = self.extension.model_to_flow(grid) @@ -750,9 +766,9 @@ def test_serialize_simple_parameter_grid(self): self.assertEqual(deserialized, grid) self.assertIsNot(deserialized, grid) - + # providing error_score because nan != nan hpo = sklearn.model_selection.GridSearchCV( - param_grid=grid, estimator=model) + param_grid=grid, estimator=model, error_score=-1000) serialized = self.extension.model_to_flow(hpo) deserialized = self.extension.flow_to_model(serialized) @@ -825,7 +841,8 @@ def test_serialize_advanced_grid_fails(self): ) with self.assertRaisesRegex( TypeError, - ".*OpenMLFlow.*is not JSON serializable", + re.compile(r".*OpenML.*Flow.*is not JSON serializable", + flags=re.DOTALL) ): self.extension.model_to_flow(clf) @@ -929,7 +946,7 @@ def test_illegal_parameter_names(self): def test_illegal_parameter_names_pipeline(self): # illegal name: steps steps = [ - ('Imputer', Imputer(strategy='median')), + ('Imputer', SimpleImputer(strategy='median')), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')), @@ -942,7 +959,7 @@ def test_illegal_parameter_names_featureunion(self): # illegal name: transformer_list transformer_list = [ ('transformer_list', - Imputer(strategy='median')), + SimpleImputer(strategy='median')), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')) @@ -1001,18 +1018,25 @@ def test_paralizable_check(self): self.extension._prevent_optimize_n_jobs(model) def test__get_fn_arguments_with_defaults(self): - if LooseVersion(sklearn.__version__) < "0.19": + sklearn_version = LooseVersion(sklearn.__version__) + if sklearn_version < "0.19": fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 
15), (sklearn.tree.DecisionTreeClassifier.__init__, 12), (sklearn.pipeline.Pipeline.__init__, 0) ] - else: + elif sklearn_version < "0.21": fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 16), (sklearn.tree.DecisionTreeClassifier.__init__, 13), (sklearn.pipeline.Pipeline.__init__, 1) ] + else: + fns = [ + (sklearn.ensemble.RandomForestRegressor.__init__, 16), + (sklearn.tree.DecisionTreeClassifier.__init__, 13), + (sklearn.pipeline.Pipeline.__init__, 2) + ] for fn, num_params_with_defaults in fns: defaults, defaultless = ( @@ -1033,7 +1057,7 @@ def test_deserialize_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter # settings. - steps = [('Imputer', Imputer()), + steps = [('Imputer', SimpleImputer()), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()), ('Estimator', sklearn.tree.DecisionTreeClassifier())] pipe_orig = sklearn.pipeline.Pipeline(steps=steps) @@ -1057,7 +1081,7 @@ def test_deserialize_adaboost_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter # settings. - steps = [('Imputer', Imputer()), + steps = [('Imputer', SimpleImputer()), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()), ('Estimator', sklearn.ensemble.AdaBoostClassifier( sklearn.tree.DecisionTreeClassifier()))] @@ -1083,7 +1107,7 @@ def test_deserialize_complex_with_defaults(self): # method to return a flow that contains default hyperparameter # settings. steps = [ - ('Imputer', Imputer()), + ('Imputer', SimpleImputer()), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()), ( 'Estimator', @@ -1126,6 +1150,8 @@ def test_openml_param_name_to_sklearn(self): task = openml.tasks.get_task(115) run = openml.runs.run_flow_on_task(flow, task) run = run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id)) run = openml.runs.get_run(run.run_id) setup = openml.setups.get_setup(run.setup_id) @@ -1217,6 +1243,14 @@ def setUp(self): ################################################################################################ # Test methods for performing runs with this extension module + def test_run_model_on_task(self): + class MyPipe(sklearn.pipeline.Pipeline): + pass + task = openml.tasks.get_task(1) + pipe = MyPipe([('imp', SimpleImputer()), + ('dummy', sklearn.dummy.DummyClassifier())]) + openml.runs.run_model_on_task(pipe, task) + def test_seed_model(self): # randomized models that are initialized without seeds, can be seeded randomized_clfs = [ @@ -1285,7 +1319,7 @@ def test_run_model_on_fold_classification_1(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', sklearn.preprocessing.Imputer()), + ('imp', SimpleImputer()), ('clf', sklearn.tree.DecisionTreeClassifier()), ]) # TODO add some mocking here to actually test the innards of this function, too! 
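Several of the assertions above are gated on the installed scikit-learn release, because `Pipeline` gained a `memory` parameter in 0.19 and a `verbose` parameter in 0.21. The hypothetical helper below (not part of the patch) condenses that gating into one place, mirroring the `LooseVersion` comparisons used in these tests.

```python
from distutils.version import LooseVersion

import sklearn

# Hypothetical helper summarising the version gating applied in these tests:
# the expected number of top-level Pipeline constructor parameters depends on
# the installed scikit-learn release.
def expected_pipeline_param_count() -> int:
    sklearn_version = LooseVersion(sklearn.__version__)
    if sklearn_version < "0.19":
        return 1  # only `steps`
    elif sklearn_version < "0.21":
        return 2  # `steps`, `memory`
    else:
        return 3  # `steps`, `memory`, `verbose`

print(expected_pipeline_param_count())
```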
@@ -1411,11 +1445,11 @@ def predict_proba(*args, **kwargs): y_train = y[train_indices] X_test = X[test_indices] clf1 = sklearn.pipeline.Pipeline(steps=[ - ('imputer', sklearn.preprocessing.Imputer()), + ('imputer', SimpleImputer()), ('estimator', sklearn.naive_bayes.GaussianNB()) ]) clf2 = sklearn.pipeline.Pipeline(steps=[ - ('imputer', sklearn.preprocessing.Imputer()), + ('imputer', SimpleImputer()), ('estimator', HardNaiveBayes()) ]) @@ -1468,7 +1502,7 @@ def test_run_model_on_fold_regression(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', sklearn.preprocessing.Imputer()), + ('imp', SimpleImputer()), ('clf', sklearn.tree.DecisionTreeRegressor()), ]) # TODO add some mocking here to actually test the innards of this function, too! @@ -1513,7 +1547,7 @@ def test_run_model_on_fold_clustering(self): X = task.get_X(dataset_format='array') pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', sklearn.preprocessing.Imputer()), + ('imp', SimpleImputer()), ('clf', sklearn.cluster.KMeans()), ]) # TODO add some mocking here to actually test the innards of this function, too! @@ -1596,3 +1630,62 @@ def test__extract_trace_data(self): self.assertIn(param_in_trace, trace_iteration.parameters) param_value = json.loads(trace_iteration.parameters[param_in_trace]) self.assertTrue(param_value in param_grid[param]) + + def test_trim_flow_name(self): + import re + long = """sklearn.pipeline.Pipeline( + columntransformer=sklearn.compose._column_transformer.ColumnTransformer( + numeric=sklearn.pipeline.Pipeline( + SimpleImputer=sklearn.preprocessing.imputation.Imputer, + standardscaler=sklearn.preprocessing.data.StandardScaler), + nominal=sklearn.pipeline.Pipeline( + simpleimputer=sklearn.impute.SimpleImputer, + onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)), + variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + svc=sklearn.svm.classes.SVC)""" + short = "sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)" + shorter = "sklearn.Pipeline(...,SVC)" + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + self.assertEqual(shorter, + SklearnExtension.trim_flow_name(long_stripped, extra_trim_length=50)) + + long = """sklearn.pipeline.Pipeline( + imputation=openmlstudy14.preprocessing.ConditionalImputer, + hotencoding=sklearn.preprocessing.data.OneHotEncoder, + variencethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + classifier=sklearn.ensemble.forest.RandomForestClassifier)""" + short = "sklearn.Pipeline(ConditionalImputer,OneHotEncoder,VarianceThreshold,RandomForestClassifier)" # noqa: E501 + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + + long = """sklearn.pipeline.Pipeline( + SimpleImputer=sklearn.preprocessing.imputation.Imputer, + VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: E501 + Estimator=sklearn.model_selection._search.RandomizedSearchCV( + estimator=sklearn.tree.tree.DecisionTreeClassifier))""" + short = "sklearn.Pipeline(Imputer,VarianceThreshold,RandomizedSearchCV(DecisionTreeClassifier))" # noqa: E501 + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + + long = """sklearn.model_selection._search.RandomizedSearchCV( + estimator=sklearn.pipeline.Pipeline( + SimpleImputer=sklearn.preprocessing.imputation.Imputer, + 
classifier=sklearn.ensemble.forest.RandomForestClassifier))""" + short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))" + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + + long = """sklearn.pipeline.FeatureUnion( + pca=sklearn.decomposition.pca.PCA, + svd=sklearn.decomposition.truncated_svd.TruncatedSVD)""" + short = "sklearn.FeatureUnion(PCA,TruncatedSVD)" + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + + long = "sklearn.ensemble.forest.RandomForestClassifier" + short = "sklearn.RandomForestClassifier" + self.assertEqual(short, SklearnExtension.trim_flow_name(long)) + + self.assertEqual("weka.IsolationForest", + SklearnExtension.trim_flow_name("weka.IsolationForest")) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 7b8c66cab..25e2dacfb 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -19,18 +19,13 @@ import sklearn.naive_bayes import sklearn.tree -if LooseVersion(sklearn.__version__) < "0.20": - from sklearn.preprocessing import Imputer -else: - from sklearn.impute import SimpleImputer as Imputer - import xmltodict import openml from openml._api_calls import _perform_api_call import openml.exceptions import openml.extensions.sklearn -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer import openml.utils @@ -41,6 +36,9 @@ def setUp(self): super().setUp() self.extension = openml.extensions.sklearn.SklearnExtension() + def tearDown(self): + super().tearDown() + def test_get_flow(self): # We need to use the production server here because 4024 is not the # test server @@ -177,6 +175,9 @@ def test_publish_flow(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) self.assertIsInstance(flow.flow_id, int) @mock.patch('openml.flows.functions.flow_exists') @@ -187,6 +188,9 @@ def test_publish_existing_flow(self, flow_exists_mock): with self.assertRaises(openml.exceptions.PyOpenMLError) as context_manager: flow.publish(raise_error_if_exists=True) + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) self.assertTrue('OpenMLFlow already exists' in context_manager.exception.message) @@ -197,6 +201,9 @@ def test_publish_flow_with_similar_components(self): flow = self.extension.model_to_flow(clf) flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) # For a flow where both components are published together, the upload # date should be equal self.assertEqual( @@ -213,6 +220,9 @@ def test_publish_flow_with_similar_components(self): flow1 = self.extension.model_to_flow(clf1) flow1, sentinel = self._add_sentinel_to_flow_name(flow1, None) flow1.publish() + TestBase._mark_entity_for_removal('flow', (flow1.flow_id, flow1.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow1.flow_id)) # In order to assign different upload times to the flows!
time.sleep(1) @@ -222,6 +232,9 @@ def test_publish_flow_with_similar_components(self): flow2 = self.extension.model_to_flow(clf2) flow2, _ = self._add_sentinel_to_flow_name(flow2, sentinel) flow2.publish() + TestBase._mark_entity_for_removal('flow', (flow2.flow_id, flow2.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow2.flow_id)) # If one component was published before the other, the components in # the flow should have different upload dates self.assertNotEqual(flow2.upload_date, @@ -234,6 +247,9 @@ def test_publish_flow_with_similar_components(self): # Child flow has different parameter. Check for storing the flow # correctly on the server should thus not check the child's parameters! flow3.publish() + TestBase._mark_entity_for_removal('flow', (flow3.flow_id, flow3.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow3.flow_id)) def test_semi_legal_flow(self): # TODO: Test if parameters are set correctly! @@ -246,6 +262,9 @@ def test_semi_legal_flow(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) @mock.patch('openml.flows.functions.get_flow') @mock.patch('openml.flows.functions.flow_exists') @@ -260,6 +279,8 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): get_flow_mock.return_value = flow flow.publish() + # Not collecting flow_id for deletion since this is a test for failed upload + self.assertEqual(api_call_mock.call_count, 1) self.assertEqual(get_flow_mock.call_count, 1) self.assertEqual(flow_exists_mock.call_count, 1) @@ -271,10 +292,13 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): with self.assertRaises(ValueError) as context_manager: flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) fixture = ( - "Flow was not stored correctly on the server. " - "New flow ID is 1. Please check manually and remove " + "The flow on the server is inconsistent with the local flow. " + "The server flow ID is 1. Please check manually and remove " "the flow if necessary! 
Error is:\n" "'Flow sklearn.ensemble.forest.RandomForestClassifier: " "values for attribute 'name' differ: " @@ -289,8 +313,8 @@ def test_illegal_flow(self): # should throw error as it contains two imputers illegal = sklearn.pipeline.Pipeline( steps=[ - ('imputer1', Imputer()), - ('imputer2', Imputer()), + ('imputer1', SimpleImputer()), + ('imputer2', SimpleImputer()), ('classif', sklearn.tree.DecisionTreeClassifier()) ] ) @@ -321,7 +345,7 @@ def test_existing_flow_exists(self): if LooseVersion(sklearn.__version__) >= '0.20': ohe_params['categories'] = 'auto' steps = [ - ('imputation', Imputer(strategy='median')), + ('imputation', SimpleImputer(strategy='median')), ('hotencoding', sklearn.preprocessing.OneHotEncoder(**ohe_params)), ( 'variencethreshold', @@ -336,6 +360,9 @@ def test_existing_flow_exists(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) # publish the flow flow = flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) # redownload the flow flow = openml.flows.get_flow(flow.flow_id) @@ -394,6 +421,9 @@ def test_sklearn_to_upload_to_flow(self): flow, sentinel = self._add_sentinel_to_flow_name(flow, None) flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) self.assertIsInstance(flow.flow_id, int) # Check whether we can load the flow again diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 087623d3d..95b4fa3f0 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -4,6 +4,7 @@ from distutils.version import LooseVersion import sklearn +from sklearn import ensemble import pandas as pd import openml @@ -14,6 +15,12 @@ class TestFlowFunctions(TestBase): _multiprocess_can_split_ = True + def setUp(self): + super(TestFlowFunctions, self).setUp() + + def tearDown(self): + super(TestFlowFunctions, self).tearDown() + def _check_flow(self, flow): self.assertEqual(type(flow), dict) self.assertEqual(len(flow), 6) @@ -242,7 +249,6 @@ def test_are_flows_equal_ignore_if_older(self): def test_sklearn_to_flow_list_of_lists(self): from sklearn.preprocessing import OrdinalEncoder ordinal_encoder = OrdinalEncoder(categories=[[0, 1], [0, 1]]) - extension = openml.extensions.sklearn.SklearnExtension() # Test serialization works @@ -251,8 +257,42 @@ def test_sklearn_to_flow_list_of_lists(self): # Test flow is accepted by server self._add_sentinel_to_flow_name(flow) flow.publish() - + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id)) # Test deserialization works server_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) self.assertEqual(server_flow.parameters['categories'], '[[0, 1], [0, 1]]') self.assertEqual(server_flow.model.categories, flow.model.categories) + + def test_get_flow_reinstantiate_model(self): + model = ensemble.RandomForestClassifier(n_estimators=33) + extension = openml.extensions.get_extension_by_model(model) + flow = extension.model_to_flow(model) + flow.publish(raise_error_if_exists=False) + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id)) + + downloaded_flow = 
openml.flows.get_flow(flow.flow_id, reinstantiate=True) + self.assertIsInstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier) + + def test_get_flow_reinstantiate_model_no_extension(self): + # Flow 10 is a WEKA flow + self.assertRaisesRegex(RuntimeError, + "No extension could be found for flow 10: weka.SMO", + openml.flows.get_flow, + flow_id=10, + reinstantiate=True) + + @unittest.skipIf(LooseVersion(sklearn.__version__) == "0.19.1", + reason="Target flow is from sklearn 0.19.1") + def test_get_flow_reinstantiate_model_wrong_version(self): + # Note that CI does not test against 0.19.1. + openml.config.server = self.production_server + _, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3] + flow = 8175 + expected = 'Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied.' + self.assertRaisesRegex(ValueError, + expected, + openml.flows.get_flow, + flow_id=flow, + reinstantiate=True) diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index bba14b324..88fe8d6ef 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -7,12 +7,13 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline -from sklearn.preprocessing import Imputer -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer import openml import openml.extensions.sklearn +import pytest + class TestRun(TestBase): # Splitting not helpful, these test's don't rely on the server and take @@ -104,7 +105,7 @@ def _check_array(array, type_): def test_to_from_filesystem_vanilla(self): model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DecisionTreeClassifier(max_depth=1)), ]) task = openml.tasks.get_task(119) @@ -129,11 +130,15 @@ def test_to_from_filesystem_vanilla(self): self.assertTrue(run_prime.flow is None) self._test_run_obj_equals(run, run_prime) run_prime.publish() + TestBase._mark_entity_for_removal('run', run_prime.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + run_prime.run_id)) + @pytest.mark.flaky(reruns=3) def test_to_from_filesystem_search(self): model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DecisionTreeClassifier(max_depth=1)), ]) model = GridSearchCV( @@ -162,11 +167,14 @@ def test_to_from_filesystem_search(self): run_prime = openml.runs.OpenMLRun.from_filesystem(cache_path) self._test_run_obj_equals(run, run_prime) run_prime.publish() + TestBase._mark_entity_for_removal('run', run_prime.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + run_prime.run_id)) def test_to_from_filesystem_no_model(self): model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DummyClassifier()), ]) task = openml.tasks.get_task(119) @@ -196,7 +204,7 @@ def test_publish_with_local_loaded_flow(self): extension = openml.extensions.sklearn.SklearnExtension() model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DummyClassifier()), ]) task = openml.tasks.get_task(119) @@ -226,6 +234,9 @@ def test_publish_with_local_loaded_flow(self): # obtain run from filesystem loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path) loaded_run.publish() + TestBase._mark_entity_for_removal('run', 
loaded_run.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + loaded_run.run_id)) # make sure the flow is published as part of publishing the run. self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version)) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 0c8b861c4..dc35d1f01 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -17,7 +17,7 @@ import pandas as pd import openml.extensions.sklearn -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer from openml.runs.functions import ( _run_task_get_arffcontent, run_exists, @@ -28,7 +28,7 @@ from sklearn.naive_bayes import GaussianNB from sklearn.model_selection._search import BaseSearchCV from sklearn.tree import DecisionTreeClassifier -from sklearn.preprocessing.imputation import Imputer + from sklearn.dummy import DummyClassifier from sklearn.preprocessing import StandardScaler from sklearn.feature_selection import VarianceThreshold @@ -184,6 +184,8 @@ def _remove_random_state(flow): flow, _ = self._add_sentinel_to_flow_name(flow, sentinel) if not openml.flows.flow_exists(flow.name, flow.external_version): flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id)) task = openml.tasks.get_task(task_id) @@ -196,6 +198,8 @@ def _remove_random_state(flow): avoid_duplicate_runs=openml.config.avoid_duplicate_runs, ) run_ = run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) self.assertEqual(run_, run) self.assertIsInstance(run.dataset_id, int) @@ -407,7 +411,7 @@ def determine_grid_size(param_grid): # suboptimal (slow), and not guaranteed to work if evaluation # engine is behind. # TODO: mock this? We have the arff already on the server - self._wait_for_processed_run(run.run_id, 200) + self._wait_for_processed_run(run.run_id, 400) try: model_prime = openml.runs.initialize_model_from_trace( run_id=run.run_id, @@ -546,7 +550,7 @@ def get_ct_cf(nominal_indices, numeric_indices): '62501', sentinel=sentinel) def test_run_and_upload_decision_tree_pipeline(self): - pipeline2 = Pipeline(steps=[('Imputer', Imputer(strategy='median')), + pipeline2 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')), ('VarianceThreshold', VarianceThreshold()), ('Estimator', RandomizedSearchCV( DecisionTreeClassifier(), @@ -653,7 +657,7 @@ def test_learning_curve_task_2(self): num_folds = 10 num_samples = 8 - pipeline2 = Pipeline(steps=[('Imputer', Imputer(strategy='median')), + pipeline2 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')), ('VarianceThreshold', VarianceThreshold()), ('Estimator', RandomizedSearchCV( DecisionTreeClassifier(), @@ -687,6 +691,8 @@ def test_initialize_cv_from_run(self): seed=1, ) run_ = run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) run = openml.runs.get_run(run_.run_id) modelR = openml.runs.initialize_model_from_run(run_id=run.run_id) @@ -708,9 +714,9 @@ def _test_local_evaluations(self, run): np.testing.assert_array_almost_equal(accuracy_scores_provided, accuracy_scores) - # also check if we can obtain some other scores: # TODO: how to do AUC? 
+ # also check if we can obtain some other scores: tests = [(sklearn.metrics.cohen_kappa_score, {'weights': None}), - (sklearn.metrics.auc, {'reorder': True}), + (sklearn.metrics.roc_auc_score, {}), (sklearn.metrics.average_precision_score, {}), (sklearn.metrics.jaccard_similarity_score, {}), (sklearn.metrics.precision_score, {'average': 'macro'}), @@ -725,10 +731,10 @@ def _test_local_evaluations(self, run): self.assertGreaterEqual(alt_scores[idx], 0) self.assertLessEqual(alt_scores[idx], 1) - def test_local_run_metric_score_swapped_parameter_order_model(self): + def test_local_run_swapped_parameter_order_model(self): # construct sci-kit learn classifier - clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), + clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('estimator', RandomForestClassifier())]) # download task @@ -736,18 +742,17 @@ def test_local_run_metric_score_swapped_parameter_order_model(self): # invoke OpenML run run = openml.runs.run_model_on_task( - model=clf, - task=task, + task, clf, avoid_duplicate_runs=False, upload_flow=False, ) self._test_local_evaluations(run) - def test_local_run_metric_score_swapped_parameter_order_flow(self): + def test_local_run_swapped_parameter_order_flow(self): # construct sci-kit learn classifier - clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), + clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('estimator', RandomForestClassifier())]) flow = self.extension.model_to_flow(clf) @@ -756,8 +761,7 @@ def test_local_run_metric_score_swapped_parameter_order_flow(self): # invoke OpenML run run = openml.runs.run_flow_on_task( - flow=flow, - task=task, + task, flow, avoid_duplicate_runs=False, upload_flow=False, ) @@ -767,7 +771,7 @@ def test_local_run_metric_score_swapped_parameter_order_flow(self): def test_local_run_metric_score(self): # construct sci-kit learn classifier - clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), + clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('estimator', RandomForestClassifier())]) # download task @@ -794,7 +798,7 @@ def test_online_run_metric_score(self): def test_initialize_model_from_run(self): clf = sklearn.pipeline.Pipeline(steps=[ - ('Imputer', Imputer(strategy='median')), + ('Imputer', SimpleImputer(strategy='median')), ('VarianceThreshold', VarianceThreshold(threshold=0.05)), ('Estimator', GaussianNB())]) task = openml.tasks.get_task(11) @@ -804,6 +808,8 @@ def test_initialize_model_from_run(self): avoid_duplicate_runs=False, ) run_ = run.publish() + TestBase._mark_entity_for_removal('run', run_.run_id) + TestBase.logger.info("collected from test_run_functions: {}".format(run_.run_id)) run = openml.runs.get_run(run_.run_id) modelR = openml.runs.initialize_model_from_run(run_id=run.run_id) @@ -855,6 +861,8 @@ def test_get_run_trace(self): num_iterations * num_folds, ) run = run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) self._wait_for_processed_run(run.run_id, 200) run_id = run.run_id except openml.exceptions.OpenMLRunsExistError as e: @@ -874,12 +882,12 @@ def test__run_exists(self): rs = 1 clfs = [ sklearn.pipeline.Pipeline(steps=[ - ('Imputer', Imputer(strategy='mean')), + ('Imputer', SimpleImputer(strategy='mean')), ('VarianceThreshold', VarianceThreshold(threshold=0.05)), ('Estimator', DecisionTreeClassifier(max_depth=4)) ]), sklearn.pipeline.Pipeline(steps=[ - ('Imputer', 
Imputer(strategy='most_frequent')), + ('Imputer', SimpleImputer(strategy='most_frequent')), ('VarianceThreshold', VarianceThreshold(threshold=0.1)), ('Estimator', DecisionTreeClassifier(max_depth=4))] ) @@ -899,6 +907,8 @@ def test__run_exists(self): upload_flow=True ) run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) except openml.exceptions.PyOpenMLError: # run already existed. Great. pass @@ -959,6 +969,8 @@ def test_run_with_illegal_flow_id_after_load(self): "but 'flow.flow_id' is not None.") with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex): loaded_run.publish() + TestBase._mark_entity_for_removal('run', loaded_run.run_id) + TestBase.logger.info("collected from test_run_functions: {}".format(loaded_run.run_id)) def test_run_with_illegal_flow_id_1(self): # Check the case where the user adds an illegal flow id to an existing @@ -968,6 +980,8 @@ def test_run_with_illegal_flow_id_1(self): flow_orig = self.extension.model_to_flow(clf) try: flow_orig.publish() # ensures flow exist on server + TestBase._mark_entity_for_removal('flow', (flow_orig.flow_id, flow_orig.name)) + TestBase.logger.info("collected from test_run_functions: {}".format(flow_orig.flow_id)) except openml.exceptions.OpenMLServerException: # flow already exists pass @@ -993,6 +1007,8 @@ def test_run_with_illegal_flow_id_1_after_load(self): flow_orig = self.extension.model_to_flow(clf) try: flow_orig.publish() # ensures flow exist on server + TestBase._mark_entity_for_removal('flow', (flow_orig.flow_id, flow_orig.name)) + TestBase.logger.info("collected from test_run_functions: {}".format(flow_orig.flow_id)) except openml.exceptions.OpenMLServerException: # flow already exists pass @@ -1235,7 +1251,7 @@ def test_run_on_dataset_with_missing_labels(self): flow.name = 'dummy' task = openml.tasks.get_task(2) - model = Pipeline(steps=[('Imputer', Imputer(strategy='median')), + model = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')), ('Estimator', DecisionTreeClassifier())]) data_content, _, _, _ = _run_task_get_arffcontent( @@ -1261,12 +1277,14 @@ def test_get_uncached_run(self): with self.assertRaises(openml.exceptions.OpenMLCacheException): openml.runs.functions._get_cached_run(10) - def test_run_model_on_task_downloaded_flow(self): + def test_run_flow_on_task_downloaded_flow(self): model = sklearn.ensemble.RandomForestClassifier(n_estimators=33) flow = self.extension.model_to_flow(model) flow.publish(raise_error_if_exists=False) + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id)) - downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) + downloaded_flow = openml.flows.get_flow(flow.flow_id) task = openml.tasks.get_task(119) # diabetes run = openml.runs.run_flow_on_task( flow=downloaded_flow, @@ -1276,3 +1294,5 @@ def test_run_model_on_task_downloaded_flow(self): ) run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id)) diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index a8f7de4d4..16e149544 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -40,6 +40,8 @@ def test_nonexisting_setup_exists(self): flow = self.extension.model_to_flow(dectree) flow.name 
= 'TEST%s%s' % (sentinel, flow.name) flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id)) # although the flow exists (created as of previous statement), # we can be sure there are no setups (yet) as it was just created @@ -52,6 +54,8 @@ def _existing_setup_exists(self, classif): flow = self.extension.model_to_flow(classif) flow.name = 'TEST%s%s' % (get_sentinel(), flow.name) flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id)) # although the flow exists, we can be sure there are no # setups (yet) as it hasn't been run @@ -66,6 +70,8 @@ def _existing_setup_exists(self, classif): # spoof flow id, otherwise the sentinel is ignored run.flow_id = flow.flow_id run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id)) # download the run, as it contains the right setup id run = openml.runs.get_run(run.run_id) diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index abee2d72a..1d9c56d54 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -1,4 +1,4 @@ -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer class TestStudyFunctions(TestBase): @@ -30,12 +30,13 @@ def test_Figure1a(self): import sklearn.pipeline import sklearn.preprocessing import sklearn.tree + benchmark_suite = openml.study.get_study( 'OpenML100', 'tasks' ) # obtain the benchmark suite clf = sklearn.pipeline.Pipeline( steps=[ - ('imputer', sklearn.preprocessing.Imputer()), + ('imputer', SimpleImputer()), ('estimator', sklearn.tree.DecisionTreeClassifier()) ] ) # build a sklearn classifier @@ -51,4 +52,7 @@ def test_Figure1a(self): ) # print accuracy score print('Data set: %s; Accuracy: %0.2f' % (task.get_dataset().name, score.mean())) run.publish() # publish the experiment on OpenML (optional) + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + run.run_id)) print('URL for run: %s/run/%d' % (openml.config.server, run.run_id)) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index c87dd8e15..33ba0c452 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -77,6 +77,9 @@ def test_publish_benchmark_suite(self): task_ids=fixture_task_ids ) study_id = study.publish() + TestBase._mark_entity_for_removal('study', study_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], study_id)) + self.assertGreater(study_id, 0) # verify main meta data @@ -132,6 +135,8 @@ def test_publish_study(self): run_ids=list(run_list.keys()) ) study_id = study.publish() + # not tracking this upload for deletion, since _delete_entity is called at the end of + # the test, which asserts the return status of openml.study.delete_study() self.assertGreater(study_id, 0) study_downloaded = openml.study.get_study(study_id) self.assertEqual(study_downloaded.alias, fixt_alias) @@ -181,6 +186,8 @@ def test_study_attach_illegal(self): run_ids=list(run_list.keys()) ) study_id = study.publish() + TestBase._mark_entity_for_removal('study', study_id) + TestBase.logger.info("collected from {}: 
{}".format(__file__.split('/')[-1], study_id)) study_original = openml.study.get_study(study_id) with self.assertRaisesRegex(openml.exceptions.OpenMLServerException, diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py index 21e03052f..168b798d1 100644 --- a/tests/test_tasks/test_clustering_task.py +++ b/tests/test_tasks/test_clustering_task.py @@ -1,5 +1,7 @@ import openml +from openml.testing import TestBase from .test_task import OpenMLTaskTest +from openml.exceptions import OpenMLServerException class OpenMLClusteringTaskTest(OpenMLTaskTest): @@ -28,19 +30,31 @@ def test_download_task(self): self.assertEqual(task.dataset_id, 36) def test_upload_task(self): - - # The base class uploads a clustering task with a target - # feature. A situation where a ground truth is available - # to benchmark the clustering algorithm. - super(OpenMLClusteringTaskTest, self).test_upload_task() - - dataset_id = self._get_compatible_rand_dataset() - # Upload a clustering task without a ground truth. - task = openml.tasks.create_task( - task_type_id=self.task_type_id, - dataset_id=dataset_id, - estimation_procedure_id=self.estimation_procedure - ) - - task_id = task.publish() - openml.utils._delete_entity('task', task_id) + compatible_datasets = self._get_compatible_rand_dataset() + for i in range(100): + try: + dataset_id = compatible_datasets[i % len(compatible_datasets)] + # Upload a clustering task without a ground truth. + task = openml.tasks.create_task( + task_type_id=self.task_type_id, + dataset_id=dataset_id, + estimation_procedure_id=self.estimation_procedure + ) + task_id = task.publish() + TestBase._mark_entity_for_removal('task', task_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + task_id)) + # success + break + except OpenMLServerException as e: + # Error code for 'task already exists' is 614 in practice, although the docs + # (https://www.openml.org/api_docs#!/task/post_task) say it should be 533 + if e.code == 614: + continue + else: + raise e + else: + raise ValueError( + 'Could not create a valid task for task type ID {}'.format(self.task_type_id) + ) diff --git a/tests/test_tasks/test_split.py b/tests/test_tasks/test_split.py index 46c6564a1..763bb15f7 100644 --- a/tests/test_tasks/test_split.py +++ b/tests/test_tasks/test_split.py @@ -19,8 +19,7 @@ def setUp(self): self.directory, "..", "files", "org", "openml", "test", "tasks", "1882", "datasplits.arff" ) - # TODO Needs to be adapted regarding the python version - self.pd_filename = self.arff_filename.replace(".arff", ".pkl") + self.pd_filename = self.arff_filename.replace(".arff", ".pkl.py3") def tearDown(self): try: diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index fe7fa5f0e..3066d9ce9 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -1,5 +1,6 @@ import unittest -from random import randint +from typing import List +from random import randint, shuffle from openml.exceptions import OpenMLServerException from openml.testing import TestBase @@ -11,9 +12,6 @@ create_task, get_task ) -from openml.utils import ( - _delete_entity, -) class OpenMLTaskTest(TestBase): @@ -47,9 +45,10 @@ def test_upload_task(self): # beforehand would not be an option because a concurrent unit test could potentially # create the same task and make this unit test fail (i.e. getting a dataset and creating # a task for it is not atomic). 
+ compatible_datasets = self._get_compatible_rand_dataset() for i in range(100): try: - dataset_id = self._get_compatible_rand_dataset() + dataset_id = compatible_datasets[i % len(compatible_datasets)] # TODO consider implementing on the diff task types. task = create_task( task_type_id=self.task_type_id, @@ -59,6 +58,9 @@ def test_upload_task(self): ) task_id = task.publish() + TestBase._mark_entity_for_removal('task', task_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + task_id)) # success break except OpenMLServerException as e: @@ -74,9 +76,7 @@ def test_upload_task(self): 'Could not create a valid task for task type ID {}'.format(self.task_type_id) ) - _delete_entity('task', task_id) - - def _get_compatible_rand_dataset(self) -> int: + def _get_compatible_rand_dataset(self) -> List: compatible_datasets = [] active_datasets = list_datasets(status='active') @@ -84,22 +84,30 @@ def _get_compatible_rand_dataset(self) -> int: # depending on the task type, find either datasets # with only symbolic features or datasets with only # numerical features. - if self.task_type_id != 2: + if self.task_type_id == 2: + # regression task + for dataset_id, dataset_info in active_datasets.items(): + if 'NumberOfSymbolicFeatures' in dataset_info: + if dataset_info['NumberOfSymbolicFeatures'] == 0: + compatible_datasets.append(dataset_id) + elif self.task_type_id == 5: + # clustering task + compatible_datasets = list(active_datasets.keys()) + else: for dataset_id, dataset_info in active_datasets.items(): # extra checks because of: # https://github.com/openml/OpenML/issues/959 if 'NumberOfNumericFeatures' in dataset_info: if dataset_info['NumberOfNumericFeatures'] == 0: compatible_datasets.append(dataset_id) - else: - for dataset_id, dataset_info in active_datasets.items(): - if 'NumberOfSymbolicFeatures' in dataset_info: - if dataset_info['NumberOfSymbolicFeatures'] == 0: - compatible_datasets.append(dataset_id) - random_dataset_pos = randint(0, len(compatible_datasets) - 1) + # in-place shuffling + shuffle(compatible_datasets) + return compatible_datasets - return compatible_datasets[random_dataset_pos] + # random_dataset_pos = randint(0, len(compatible_datasets) - 1) + # + # return compatible_datasets[random_dataset_pos] def _get_random_feature(self, dataset_id: int) -> str: diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index ef3a454d8..f773752d5 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -12,6 +12,12 @@ class TestTask(TestBase): _multiprocess_can_split_ = True + def setUp(self): + super(TestTask, self).setUp() + + def tearDown(self): + super(TestTask, self).tearDown() + def test__get_cached_tasks(self): openml.config.cache_directory = self.static_cache_dir tasks = openml.tasks.functions._get_cached_tasks() @@ -78,6 +84,8 @@ def test_list_tasks_empty(self): self.assertIsInstance(tasks, dict) + @unittest.skip("Server will currently incorrectly return only 99 tasks." 
+ " See https://github.com/openml/OpenML/issues/980") def test_list_tasks_by_tag(self): num_basic_tasks = 100 # number is flexible, check server if fails tasks = openml.tasks.list_tasks(tag='OpenML100') diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 55cbba64b..4a0789414 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -7,6 +7,12 @@ # Common methods between tasks class OpenMLTaskMethodsTest(TestBase): + def setUp(self): + super(OpenMLTaskMethodsTest, self).setUp() + + def tearDown(self): + super(OpenMLTaskMethodsTest, self).tearDown() + def test_tagging(self): task = openml.tasks.get_task(1) tag = "testing_tag_{}_{}".format(self.id(), time()) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 04f803f86..1f754c23a 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -43,12 +43,16 @@ def test_list_all_for_datasets(self): self._check_dataset(datasets[did]) def test_list_datasets_with_high_size_parameter(self): + # Testing on the production server, since concurrent deletion of uploaded datasets makes the test fail + openml.config.server = self.production_server + datasets_a = openml.datasets.list_datasets() datasets_b = openml.datasets.list_datasets(size=np.inf) - # note that in the meantime the number of datasets could have increased - # due to tests that run in parallel. - self.assertGreaterEqual(len(datasets_b), len(datasets_a)) + # Reverting to test server + openml.config.server = self.test_server + + self.assertEqual(len(datasets_a), len(datasets_b)) def test_list_all_for_tasks(self): required_size = 1068 # default test server reset value
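The TestBase._mark_entity_for_removal(...) and TestBase.logger.info(...) pairs added after nearly every publish() call in this patch register uploaded flows, runs, tasks, and studies so they can be deleted from the test server afterwards. The helper itself lives in openml/testing.py and is not shown in this diff; the sketch below only illustrates the assumed bookkeeping (the class name TrackedTestBaseSketch and the method _delete_tracked_entities are made up for illustration), and it ignores details such as deletion order (e.g. runs before their flows) that a real implementation would have to respect.

import logging

import openml
import openml.exceptions
import openml.utils


class TrackedTestBaseSketch:
    """Collects entities uploaded to the test server so they can be removed later."""

    logger = logging.getLogger("openml_test_cleanup")
    # maps an entity type ('run', 'flow', 'task', 'study') to the collected ids;
    # flows are tracked as (flow_id, flow_name) tuples, everything else by plain id
    publish_tracker = {}

    @classmethod
    def _mark_entity_for_removal(cls, entity_type, entity_id):
        # called right after a successful publish(), as in the tests above
        cls.publish_tracker.setdefault(entity_type, []).append(entity_id)

    @classmethod
    def _delete_tracked_entities(cls):
        for entity_type, entities in cls.publish_tracker.items():
            for entity in entities:
                entity_id = entity[0] if entity_type == 'flow' else entity
                try:
                    # _delete_entity is the same helper the old tests called directly
                    openml.utils._delete_entity(entity_type, entity_id)
                    cls.logger.info("deleted %s %s", entity_type, entity_id)
                except openml.exceptions.OpenMLServerException:
                    # the entity may already have been removed; log and continue
                    cls.logger.warning("could not delete %s %s", entity_type, entity_id)
            entities.clear()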