Release 0.14 (#1266)
mfeurer authored Jul 4, 2023
1 parent 3380bbb commit 2791074
Showing 56 changed files with 1,031 additions and 601 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pre-commit.yaml
@@ -7,10 +7,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Setup Python 3.7
- name: Setup Python 3.8
uses: actions/setup-python@v4
with:
python-version: 3.7
python-version: 3.8
- name: Install pre-commit
run: |
pip install pre-commit
2 changes: 2 additions & 0 deletions .github/workflows/test.yml
@@ -53,6 +53,7 @@ jobs:
- os: windows-latest
sklearn-only: 'false'
scikit-learn: 0.24.*
scipy: 1.10.0
fail-fast: false
max-parallel: 4

@@ -113,5 +114,6 @@ jobs:
uses: codecov/codecov-action@v3
with:
files: coverage.xml
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: true
verbose: true
14 changes: 11 additions & 3 deletions .pre-commit-config.yaml
@@ -1,11 +1,11 @@
repos:
- repo: https://github.com/psf/black
rev: 22.6.0
rev: 23.3.0
hooks:
- id: black
args: [--line-length=100]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.961
rev: v1.4.1
hooks:
- id: mypy
name: mypy openml
@@ -19,8 +19,16 @@ repos:
additional_dependencies:
- types-requests
- types-python-dateutil
- id: mypy
name: mypy top-level-functions
files: openml/_api_calls.py
additional_dependencies:
- types-requests
- types-python-dateutil
args: [ --disallow-untyped-defs, --disallow-any-generics,
--disallow-any-explicit, --implicit-optional ]
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
rev: 6.0.0
hooks:
- id: flake8
name: flake8 openml
16 changes: 10 additions & 6 deletions README.md
@@ -20,15 +20,19 @@ following paper:

[Matthias Feurer, Jan N. van Rijn, Arlind Kadra, Pieter Gijsbers, Neeratyoy Mallik, Sahithya Ravi, Andreas Müller, Joaquin Vanschoren, Frank Hutter<br/>
**OpenML-Python: an extensible Python API for OpenML**<br/>
*arXiv:1911.02490 [cs.LG]*](https://arxiv.org/abs/1911.02490)
Journal of Machine Learning Research, 22(100):1−5, 2021](https://www.jmlr.org/papers/v22/19-920.html)

Bibtex entry:
```bibtex
@article{feurer-arxiv19a,
author = {Matthias Feurer and Jan N. van Rijn and Arlind Kadra and Pieter Gijsbers and Neeratyoy Mallik and Sahithya Ravi and Andreas Müller and Joaquin Vanschoren and Frank Hutter},
title = {OpenML-Python: an extensible Python API for OpenML},
journal = {arXiv:1911.02490},
year = {2019},
@article{JMLR:v22:19-920,
author = {Matthias Feurer and Jan N. van Rijn and Arlind Kadra and Pieter Gijsbers and Neeratyoy Mallik and Sahithya Ravi and Andreas Müller and Joaquin Vanschoren and Frank Hutter},
title = {OpenML-Python: an extensible Python API for OpenML},
journal = {Journal of Machine Learning Research},
year = {2021},
volume = {22},
number = {100},
pages = {1--5},
url = {http://jmlr.org/papers/v22/19-920.html}
}
```

26 changes: 15 additions & 11 deletions doc/index.rst
@@ -30,7 +30,7 @@ Example
('estimator', tree.DecisionTreeClassifier())
]
)
# Download the OpenML task for the german credit card dataset with 10-fold
# Download the OpenML task for the pendigits dataset with 10-fold
# cross-validation.
task = openml.tasks.get_task(32)
# Run the scikit-learn model on the task.
@@ -93,17 +93,21 @@ Citing OpenML-Python
If you use OpenML-Python in a scientific publication, we would appreciate a
reference to the following paper:


`OpenML-Python: an extensible Python API for OpenML
<https://arxiv.org/abs/1911.02490>`_,
Feurer *et al.*, arXiv:1911.02490.
| Matthias Feurer, Jan N. van Rijn, Arlind Kadra, Pieter Gijsbers, Neeratyoy Mallik, Sahithya Ravi, Andreas Müller, Joaquin Vanschoren, Frank Hutter
| **OpenML-Python: an extensible Python API for OpenML**
| Journal of Machine Learning Research, 22(100):1−5, 2021
| `https://www.jmlr.org/papers/v22/19-920.html <https://www.jmlr.org/papers/v22/19-920.html>`_
Bibtex entry::

@article{feurer-arxiv19a,
author = {Matthias Feurer and Jan N. van Rijn and Arlind Kadra and Pieter Gijsbers and Neeratyoy Mallik and Sahithya Ravi and Andreas Müller and Joaquin Vanschoren and Frank Hutter},
title = {OpenML-Python: an extensible Python API for OpenML},
journal = {arXiv:1911.02490},
year = {2019},
}
@article{JMLR:v22:19-920,
author = {Matthias Feurer and Jan N. van Rijn and Arlind Kadra and Pieter Gijsbers and Neeratyoy Mallik and Sahithya Ravi and Andreas Müller and Joaquin Vanschoren and Frank Hutter},
title = {OpenML-Python: an extensible Python API for OpenML},
journal = {Journal of Machine Learning Research},
year = {2021},
volume = {22},
number = {100},
pages = {1--5},
url = {http://jmlr.org/papers/v22/19-920.html}
}

59 changes: 46 additions & 13 deletions doc/progress.rst
@@ -6,22 +6,55 @@
Changelog
=========

0.14.0
~~~~~~

**IMPORTANT:** This release paves the way towards a breaking update of OpenML-Python. From version
0.15, functions that had the option to return a pandas DataFrame will return a pandas DataFrame
by default. This version (0.14) emits a warning if you still use the old access functionality.
More concretely:

* In 0.15 we will drop the ability to return dictionaries in listing calls and only provide
pandas DataFrames. To disable warnings in 0.14 you have to request a pandas DataFrame
(using ``output_format="dataframe"``).
* In 0.15 we will drop the ability to return datasets as numpy arrays and only provide
pandas DataFrames. To disable warnings in 0.14 you have to request a pandas DataFrame
(using ``dataset_format="dataframe"``).

Furthermore, from version 0.15, OpenML-Python will no longer download datasets and dataset metadata
by default. This version (0.14) emits a warning if you don't explicitly specify the desired behavior.

Please see the pull requests #1258 and #1260 for further information.
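
For example, the forward-compatible calls look like the following sketch (the dataset ID is only a placeholder)::

    import openml

    # Listing calls: request a pandas DataFrame instead of the legacy dict output.
    datalist = openml.datasets.list_datasets(output_format="dataframe")

    # Dataset access: request pandas DataFrames instead of numpy arrays,
    # and state explicitly whether the data should be downloaded.
    dataset = openml.datasets.get_dataset(61, download_data=True)  # placeholder dataset ID
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute
    )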

* ADD #1081: New flag that allows disabling downloading dataset features.
* ADD #1132: New flag that forces a redownload of cached data.
* FIX #1244: Fixes a rare bug where task listing could fail when the server returned invalid data.
* DOC #1229: Fixes a comment string for the main example.
* DOC #1241: Fixes a comment in an example.
* MAINT #1124: Improve naming of helper functions that govern the cache directories.
* MAINT #1223, #1250: Update tools used in pre-commit to the latest versions (``black==23.3.0``, ``mypy==1.3.0``, ``flake8==6.0.0``).
* MAINT #1253: Update the citation request to the JMLR paper.
* MAINT #1246: Add a warning that checking for duplicate runs on the server cannot be done without an API key.

0.13.1
~~~~~~

* ADD #1028: Add functions to delete runs, flows, datasets, and tasks (e.g., ``openml.datasets.delete_dataset``).
* ADD #1144: Add locally computed results to the ``OpenMLRun`` object's representation if the run was created locally and not downloaded from the server.
* ADD #1180: Improve the error message when the checksum of a downloaded dataset does not match the checksum provided by the API.
* ADD #1201: Make ``OpenMLTraceIteration`` a dataclass.
* DOC #1069: Add argument documentation for the ``OpenMLRun`` class.
* FIX #1197 #559 #1131: Fix the order of ground truth and predictions in the ``OpenMLRun`` object and in ``format_prediction``.
* FIX #1198: Support numpy 1.24 and higher.
* FIX #1216: Allow unknown task types on the server. This is only relevant when new task types are added to the test server.
* MAINT #1155: Add dependabot github action to automatically update other github actions.
* MAINT #1199: Obtain pre-commit's flake8 from github.com instead of gitlab.com.
* MAINT #1215: Support latest numpy version.
* MAINT #1218: Test Python3.6 on Ubuntu 20.04 instead of the latest Ubuntu (which is 22.04).
* MAINT #1221 #1212 #1206 #1211: Update github actions to the latest versions.
* ADD #1081 #1132: Add additional options for (not) downloading datasets ``openml.datasets.get_dataset`` and cache management.
* ADD #1028: Add functions to delete runs, flows, datasets, and tasks (e.g., ``openml.datasets.delete_dataset``).
* ADD #1144: Add locally computed results to the ``OpenMLRun`` object's representation if the run was created locally and not downloaded from the server.
* ADD #1180: Improve the error message when the checksum of a downloaded dataset does not match the checksum provided by the API.
* ADD #1201: Make ``OpenMLTraceIteration`` a dataclass.
* DOC #1069: Add argument documentation for the ``OpenMLRun`` class.
* DOC #1241 #1229 #1231: Minor documentation fixes and resolve documentation examples not working.
* FIX #1197 #559 #1131: Fix the order of ground truth and predictions in the ``OpenMLRun`` object and in ``format_prediction``.
* FIX #1198: Support numpy 1.24 and higher.
* FIX #1216: Allow unknown task types on the server. This is only relevant when new task types are added to the test server.
* FIX #1223: Fix mypy errors for implicit optional typing.
* MAINT #1155: Add dependabot github action to automatically update other github actions.
* MAINT #1199: Obtain pre-commit's flake8 from github.com instead of gitlab.com.
* MAINT #1215: Support latest numpy version.
* MAINT #1218: Test Python3.6 on Ubuntu 20.04 instead of the latest Ubuntu (which is 22.04).
* MAINT #1221 #1212 #1206 #1211: Update github actions to the latest versions.

0.13.0
~~~~~~
2 changes: 1 addition & 1 deletion examples/20_basic/simple_flows_and_runs_tutorial.py
@@ -23,7 +23,7 @@
# NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20
dataset = openml.datasets.get_dataset(20)
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="array", target=dataset.default_target_attribute
target=dataset.default_target_attribute
)
clf = neighbors.KNeighborsClassifier(n_neighbors=3)
clf.fit(X, y)
4 changes: 2 additions & 2 deletions examples/30_extended/configure_logging.py
@@ -37,8 +37,8 @@

import logging

openml.config.console_log.setLevel(logging.DEBUG)
openml.config.file_log.setLevel(logging.WARNING)
openml.config.set_console_log_level(logging.DEBUG)
openml.config.set_file_log_level(logging.WARNING)
openml.datasets.get_dataset("iris")

# Now the log level that was previously written to file should also be shown in the console.
6 changes: 5 additions & 1 deletion examples/30_extended/custom_flow_.py
@@ -77,6 +77,8 @@
# you can use the Random Forest Classifier flow as a *subflow*. It allows for
# all hyperparameters of the Random Forest Classifier flow to also be specified in your pipeline flow.
#
# Note: you can currently only specify one subflow as part of the components.
#
# In this example, the auto-sklearn flow is a subflow: the auto-sklearn flow is entirely executed as part of this flow.
# This allows people to specify auto-sklearn hyperparameters used in this flow.
# In general, using a subflow is not required.
@@ -87,6 +89,8 @@
autosklearn_flow = openml.flows.get_flow(9313) # auto-sklearn 0.5.1
subflow = dict(
components=OrderedDict(automl_tool=autosklearn_flow),
# If you do not want to reference a subflow, you can use the following:
# components=OrderedDict(),
)

####################################################################################################
@@ -124,7 +128,7 @@
OrderedDict([("oml:name", "time"), ("oml:value", 120), ("oml:component", flow_id)]),
]

task_id = 1965 # Iris Task
task_id = 1200 # Iris Task
task = openml.tasks.get_task(task_id)
dataset_id = task.get_dataset().dataset_id

35 changes: 15 additions & 20 deletions examples/30_extended/datasets_tutorial.py
@@ -21,10 +21,9 @@
# * Use the output_format parameter to select output type
# * Default gives 'dict' (other option: 'dataframe', see below)
#
openml_list = openml.datasets.list_datasets() # returns a dict

# Show a nice table with some key data properties
datalist = pd.DataFrame.from_dict(openml_list, orient="index")
# Note: list_datasets will return a pandas dataframe by default from 0.15. When using
# openml-python 0.14, `list_datasets` will warn you to use output_format='dataframe'.
datalist = openml.datasets.list_datasets(output_format="dataframe")
datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]]

print(f"First 10 of {len(datalist)} datasets...")
@@ -65,23 +64,16 @@
############################################################################
# Get the actual data.
#
# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy
# sparse matrix, or as a Pandas DataFrame. The format is
# controlled with the parameter ``dataset_format`` which can be either 'array'
# (default) or 'dataframe'. Let's first build our dataset from a NumPy array
# and manually create a dataframe.
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="array", target=dataset.default_target_attribute
)
eeg = pd.DataFrame(X, columns=attribute_names)
eeg["class"] = y
print(eeg[:10])
# openml-python returns data as pandas dataframes (stored in the `eeg` variable below),
# and also some additional metadata that we don't care about right now.
eeg, *_ = dataset.get_data()

############################################################################
# Instead of manually creating the dataframe, you can already request a
# dataframe with the correct dtypes.
# You can optionally choose to have openml separate out a column from the
# dataset. In particular, many datasets for supervised problems have a
# `default_target_attribute` set, which may help identify the target variable.
X, y, categorical_indicator, attribute_names = dataset.get_data(
target=dataset.default_target_attribute, dataset_format="dataframe"
target=dataset.default_target_attribute
)
print(X.head())
print(X.info())
@@ -92,6 +84,9 @@
# data file. The dataset object can be used as normal.
# Whenever you use any functionality that requires the data,
# such as `get_data`, the data will be downloaded.
# Starting from 0.15, not downloading data will be the default behavior instead.
# The data will be downloaded automatically when you try to access it through
# openml objects, e.g., using `dataset.features`.
dataset = openml.datasets.get_dataset(1471, download_data=False)

############################################################################
@@ -100,8 +95,8 @@
# * Explore the data visually.
eegs = eeg.sample(n=1000)
_ = pd.plotting.scatter_matrix(
eegs.iloc[:100, :4],
c=eegs[:100]["class"],
X.iloc[:100, :4],
c=y[:100],
figsize=(10, 10),
marker="o",
hist_kwds={"bins": 20},
1 change: 1 addition & 0 deletions examples/30_extended/fetch_runtimes_tutorial.py
@@ -79,6 +79,7 @@
)
)


# Creating utility function
def print_compare_runtimes(measures):
for repeat, val1 in measures["usercpu_time_millis_training"].items():
9 changes: 4 additions & 5 deletions examples/30_extended/flows_and_runs_tutorial.py
@@ -27,7 +27,7 @@
# NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68
dataset = openml.datasets.get_dataset(68)
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="array", target=dataset.default_target_attribute
target=dataset.default_target_attribute
)
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)
@@ -38,7 +38,7 @@
# * e.g. categorical features -> do feature encoding
dataset = openml.datasets.get_dataset(17)
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="array", target=dataset.default_target_attribute
target=dataset.default_target_attribute
)
print(f"Categorical features: {categorical_indicator}")
transformer = compose.ColumnTransformer(
@@ -160,7 +160,7 @@
]
)

run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array")
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
myrun = run.publish()
print(f"Uploaded to {myrun.openml_url}")

@@ -172,15 +172,14 @@

# To perform the following line offline, it is required to have been called before
# such that the task is cached on the local openml cache directory:
task = openml.tasks.get_task(6)
task = openml.tasks.get_task(96)

# The following lines can then be executed offline:
run = openml.runs.run_model_on_task(
pipe,
task,
avoid_duplicate_runs=False,
upload_flow=False,
dataset_format="array",
)

# The run may be stored offline, and the flow will be stored along with it:
2 changes: 1 addition & 1 deletion examples/30_extended/suites_tutorial.py
@@ -75,7 +75,7 @@

# We'll take a random subset of at least ten tasks of all available tasks on
# the test server:
all_tasks = list(openml.tasks.list_tasks().keys())
all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"])
task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))

# The study needs a machine-readable and unique alias. To obtain this,