[pre-commit.ci] pre-commit autoupdate #1223

Merged 6 commits on Apr 25, 2023
4 changes: 2 additions & 2 deletions .github/workflows/pre-commit.yaml
@@ -7,10 +7,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Setup Python 3.7
- name: Setup Python 3.8
uses: actions/setup-python@v4
with:
python-version: 3.7
python-version: 3.8
- name: Install pre-commit
run: |
pip install pre-commit
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -1,11 +1,11 @@
repos:
- repo: https://github.com/psf/black
rev: 22.6.0
rev: 23.3.0
hooks:
- id: black
args: [--line-length=100]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.961
rev: v1.2.0
hooks:
- id: mypy
name: mypy openml
@@ -20,7 +20,7 @@ repos:
- types-requests
- types-python-dateutil
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
rev: 6.0.0
hooks:
- id: flake8
name: flake8 openml
3 changes: 2 additions & 1 deletion doc/progress.rst
@@ -9,15 +9,16 @@ Changelog
0.13.1
~~~~~~

* DOC #1241 #1229 #1231: Minor documentation fixes and resolve documentation examples not working.
* ADD #1028: Add functions to delete runs, flows, datasets, and tasks (e.g., ``openml.datasets.delete_dataset``).
* ADD #1144: Add locally computed results to the ``OpenMLRun`` object's representation if the run was created locally and not downloaded from the server.
* ADD #1180: Improve the error message when the checksum of a downloaded dataset does not match the checksum provided by the API.
* ADD #1201: Make ``OpenMLTraceIteration`` a dataclass.
* DOC #1069: Add argument documentation for the ``OpenMLRun`` class.
* DOC #1241 #1229 #1231: Minor documentation fixes and resolve documentation examples not working.
* FIX #1197 #559 #1131: Fix the order of ground truth and predictions in the ``OpenMLRun`` object and in ``format_prediction``.
* FIX #1198: Support numpy 1.24 and higher.
* FIX #1216: Allow unknown task types on the server. This is only relevant when new task types are added to the test server.
* FIX #1223: Fix mypy errors for implicit optional typing.
* MAINT #1155: Add dependabot github action to automatically update other github actions.
* MAINT #1199: Obtain pre-commit's flake8 from github.com instead of gitlab.com.
* MAINT #1215: Support latest numpy version.
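Most of the source changes in this PR follow from the mypy bump above: starting with mypy 0.990, implicit Optional is disabled by default, so a parameter annotated `str` can no longer silently default to `None`. A minimal before/after sketch (the function name here is hypothetical, chosen to mirror the signatures changed below):

```python
from typing import Optional

# Before: rejected by mypy v1.x, because the annotation says `str`
# but the default value is None.
#   def download(md5_checksum: str = None) -> str: ...

# After: the Optional wrapper makes the None default explicit.
def download(md5_checksum: Optional[str] = None) -> str:
    return md5_checksum or "<no checksum>"


print(download())           # "<no checksum>"
print(download("abc123"))   # "abc123"
```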
1 change: 1 addition & 0 deletions examples/30_extended/fetch_runtimes_tutorial.py
@@ -79,6 +79,7 @@
)
)


# Creating utility function
def print_compare_runtimes(measures):
for repeat, val1 in measures["usercpu_time_millis_training"].items():
3 changes: 1 addition & 2 deletions openml/_api_calls.py
@@ -195,7 +195,7 @@ def _download_minio_bucket(
def _download_text_file(
source: str,
output_path: Optional[str] = None,
md5_checksum: str = None,
md5_checksum: Optional[str] = None,
exists_ok: bool = True,
encoding: str = "utf8",
) -> Optional[str]:
@@ -326,7 +326,6 @@ def _send_request(request_method, url, data, files=None, md5_checksum=None):
if request_method == "get" and not __is_checksum_equal(
response.text.encode("utf-8"), md5_checksum
):

# -- Check if encoding is not UTF-8 perhaps
if __is_checksum_equal(response.content, md5_checksum):
raise OpenMLHashException(
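For context on the hunk above: the guard first hashes the UTF-8 encoded response text and, on mismatch, retries against the raw bytes to catch responses served in another encoding. The real helper is the private `__is_checksum_equal`, whose body is not shown in this diff; a plausible sketch, assuming the API returns an MD5 hex digest:

```python
import hashlib
from typing import Optional


def is_checksum_equal(content: bytes, md5_checksum: Optional[str]) -> bool:
    # Assumption: a missing checksum is treated as a pass.
    if md5_checksum is None:
        return True
    return hashlib.md5(content).hexdigest() == md5_checksum


assert is_checksum_equal(b"data", hashlib.md5(b"data").hexdigest())
```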
1 change: 0 additions & 1 deletion openml/datasets/dataset.py
@@ -274,7 +274,6 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
return [(key, fields[key]) for key in order if key in fields]

def __eq__(self, other):

if not isinstance(other, OpenMLDataset):
return False

14 changes: 7 additions & 7 deletions openml/datasets/functions.py
@@ -74,7 +74,6 @@ def list_datasets(
output_format: str = "dict",
**kwargs,
) -> Union[Dict, pd.DataFrame]:

"""
Return a list of all datasets which are on OpenML.
Supports a large number of results.
@@ -182,7 +181,6 @@ def _list_datasets(data_id: Optional[List] = None, output_format="dict", **kwarg


def __list_datasets(api_call, output_format="dict"):

xml_string = openml._api_calls._perform_api_call(api_call, "get")
datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",))

@@ -353,7 +351,7 @@ def get_datasets(
def get_dataset(
dataset_id: Union[int, str],
download_data: bool = True,
version: int = None,
version: Optional[int] = None,
error_if_multiple: bool = False,
cache_format: str = "pickle",
download_qualities: bool = True,
@@ -984,7 +982,7 @@ def _get_dataset_description(did_cache_dir, dataset_id):

def _get_dataset_parquet(
description: Union[Dict, OpenMLDataset],
cache_directory: str = None,
cache_directory: Optional[str] = None,
download_all_files: bool = False,
) -> Optional[str]:
"""Return the path to the local parquet file of the dataset. If is not cached, it is downloaded.
Expand Down Expand Up @@ -1051,7 +1049,9 @@ def _get_dataset_parquet(
return output_file_path


def _get_dataset_arff(description: Union[Dict, OpenMLDataset], cache_directory: str = None) -> str:
def _get_dataset_arff(
description: Union[Dict, OpenMLDataset], cache_directory: Optional[str] = None
) -> str:
"""Return the path to the local arff file of the dataset. If is not cached, it is downloaded.
Checks if the file is in the cache, if yes, return the path to the file.
@@ -1173,8 +1173,8 @@ def _create_dataset_from_description(
description: Dict[str, str],
features_file: str,
qualities_file: str,
arff_file: str = None,
parquet_file: str = None,
arff_file: Optional[str] = None,
parquet_file: Optional[str] = None,
cache_format: str = "pickle",
) -> OpenMLDataset:
"""Create a dataset object from a description dict.
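As a usage note for the `get_dataset` signature shown above: passing `version` pins a named dataset to one concrete upload, and `error_if_multiple` controls what happens when a name alone matches several uploads. A small sketch (dataset name chosen for illustration; requires network access):

```python
import openml

# Pin "iris" to version 1; without `version`, an ambiguous name either
# resolves to one match or, with error_if_multiple=True, raises instead
# of guessing.
dataset = openml.datasets.get_dataset("iris", version=1)
print(dataset.name, dataset.version)
```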
4 changes: 3 additions & 1 deletion openml/exceptions.py
@@ -1,5 +1,7 @@
# License: BSD 3-Clause

from typing import Optional


class PyOpenMLError(Exception):
def __init__(self, message: str):
@@ -20,7 +22,7 @@ class OpenMLServerException(OpenMLServerError):

# Code needs to be optional to allow the exception to be picklable:
# https://stackoverflow.com/questions/16244923/how-to-make-a-custom-exception-class-with-multiple-init-args-pickleable # noqa: E501
def __init__(self, message: str, code: int = None, url: str = None):
def __init__(self, message: str, code: Optional[int] = None, url: Optional[str] = None):
self.message = message
self.code = code
self.url = url
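The picklability comment in the diff above deserves a short illustration. `pickle` reconstructs an exception by calling its class with `self.args`, which only holds what was passed to `Exception.__init__`; an extra required positional parameter therefore breaks unpickling, while a defaulted one keeps it safe. A sketch:

```python
import pickle


class Fragile(Exception):
    def __init__(self, message: str, code: int):  # `code` is required
        super().__init__(message)  # only `message` lands in self.args
        self.code = code


class Picklable(Exception):
    def __init__(self, message: str, code=None):  # default restores picklability
        super().__init__(message)
        self.code = code


try:
    pickle.loads(pickle.dumps(Fragile("boom", 500)))
except TypeError as e:
    print("unpickling failed:", e)  # __init__ re-invoked without `code`

err = pickle.loads(pickle.dumps(Picklable("boom", 500)))
print(type(err).__name__)  # Picklable (note: `code` is not round-tripped)
```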
2 changes: 1 addition & 1 deletion openml/extensions/extension_interface.py
@@ -166,7 +166,7 @@ def _run_model_on_fold(
y_train: Optional[np.ndarray] = None,
X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix]] = None,
) -> Tuple[np.ndarray, np.ndarray, "OrderedDict[str, float]", Optional["OpenMLRunTrace"]]:
"""Run a model on a repeat,fold,subsample triplet of the task and return prediction information.
"""Run a model on a repeat, fold, subsample triplet of the task.
Returns the data that is necessary to construct the OpenML Run object. Is used by
:func:`openml.runs.run_flow_on_task`.
3 changes: 0 additions & 3 deletions openml/extensions/sklearn/extension.py
@@ -1021,7 +1021,6 @@ def flatten_all(list_):
# when deserializing the parameter
sub_components_explicit.add(identifier)
if isinstance(sub_component, str):

external_version = self._get_external_version_string(None, {})
dependencies = self._get_dependencies()
tags = self._get_tags()
@@ -1072,7 +1071,6 @@ def flatten_all(list_):
parameters[k] = parameter_json

elif isinstance(rval, OpenMLFlow):

# A subcomponent, for example the base model in
# AdaBoostClassifier
sub_components[k] = rval
@@ -1762,7 +1760,6 @@ def _prediction_to_probabilities(
)

if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):

try:
proba_y = model_copy.predict_proba(X_test)
proba_y = pd.DataFrame(proba_y, columns=model_classes) # handles X_test as numpy
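The `try` around `predict_proba` above exists because not every sklearn estimator exposes probability estimates; `_prediction_to_probabilities` is the fallback. A plausible sketch of such a fallback, assuming `model_classes` is a plain list of labels (the real implementation may differ):

```python
import numpy as np
import pandas as pd


def prediction_to_probabilities(pred_y, model_classes):
    # One-hot encode hard predictions so a classifier without
    # predict_proba still yields an (n_samples, n_classes) frame.
    proba = np.zeros((len(pred_y), len(model_classes)))
    for row, label in enumerate(pred_y):
        proba[row, model_classes.index(label)] = 1.0
    return pd.DataFrame(proba, columns=model_classes)


print(prediction_to_probabilities(["b", "a"], ["a", "b"]))
```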
5 changes: 1 addition & 4 deletions openml/flows/functions.py
@@ -120,7 +120,6 @@ def _get_flow_description(flow_id: int) -> OpenMLFlow:
try:
return _get_cached_flow(flow_id)
except OpenMLCacheException:

xml_file = os.path.join(
openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id),
"flow.xml",
@@ -140,7 +139,6 @@ def list_flows(
output_format: str = "dict",
**kwargs
) -> Union[Dict, pd.DataFrame]:

"""
Return a list of all flows which are on OpenML.
(Supports a large number of results.)
@@ -329,7 +327,6 @@ def get_flow_id(


def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]:

xml_string = openml._api_calls._perform_api_call(api_call, "get")
flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",))

@@ -377,7 +374,7 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None:
def assert_flows_equal(
flow1: OpenMLFlow,
flow2: OpenMLFlow,
ignore_parameter_values_on_older_children: str = None,
ignore_parameter_values_on_older_children: Optional[str] = None,
ignore_parameter_values: bool = False,
ignore_custom_name_if_none: bool = False,
check_description: bool = True,
23 changes: 12 additions & 11 deletions openml/runs/functions.py
@@ -49,8 +49,8 @@ def run_model_on_task(
model: Any,
task: Union[int, str, OpenMLTask],
avoid_duplicate_runs: bool = True,
flow_tags: List[str] = None,
seed: int = None,
flow_tags: Optional[List[str]] = None,
seed: Optional[int] = None,
add_local_measures: bool = True,
upload_flow: bool = False,
return_flow: bool = False,
@@ -148,8 +148,8 @@ def run_flow_on_task(
flow: OpenMLFlow,
task: OpenMLTask,
avoid_duplicate_runs: bool = True,
flow_tags: List[str] = None,
seed: int = None,
flow_tags: Optional[List[str]] = None,
seed: Optional[int] = None,
add_local_measures: bool = True,
upload_flow: bool = False,
dataset_format: str = "dataframe",
@@ -438,7 +438,7 @@ def _run_task_get_arffcontent(
extension: "Extension",
add_local_measures: bool,
dataset_format: str,
n_jobs: int = None,
n_jobs: Optional[int] = None,
) -> Tuple[
List[List],
Optional[OpenMLRunTrace],
@@ -505,7 +505,6 @@ def _calculate_local_measure(sklearn_fn, openml_name):
user_defined_measures_fold[openml_name] = sklearn_fn(test_y, pred_y)

if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):

for i, tst_idx in enumerate(test_indices):
if task.class_labels is not None:
prediction = (
@@ -549,7 +548,6 @@ def _calculate_local_measure(sklearn_fn, openml_name):
)

elif isinstance(task, OpenMLRegressionTask):

for i, _ in enumerate(test_indices):
truth = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i]
arff_line = format_prediction(
@@ -570,7 +568,6 @@ def _calculate_local_measure(sklearn_fn, openml_name):
)

elif isinstance(task, OpenMLClusteringTask):

for i, _ in enumerate(test_indices):
arff_line = [test_indices[i], pred_y[i]] # row_id, cluster ID
arff_datacontent.append(arff_line)
@@ -579,7 +576,6 @@ def _calculate_local_measure(sklearn_fn, openml_name):
raise TypeError(type(task))

for measure in user_defined_measures_fold:

if measure not in user_defined_measures_per_fold:
user_defined_measures_per_fold[measure] = OrderedDict()
if rep_no not in user_defined_measures_per_fold[measure]:
@@ -625,7 +621,7 @@ def _run_task_get_arffcontent_parallel_helper(
sample_no: int,
task: OpenMLTask,
dataset_format: str,
configuration: Dict = None,
configuration: Optional[Dict] = None,
) -> Tuple[
np.ndarray,
Optional[pd.DataFrame],
@@ -674,7 +670,12 @@ def _run_task_get_arffcontent_parallel_helper(
sample_no,
)
)
pred_y, proba_y, user_defined_measures_fold, trace, = extension._run_model_on_fold(
(
pred_y,
proba_y,
user_defined_measures_fold,
trace,
) = extension._run_model_on_fold(
model=model,
task=task,
X_train=train_x,
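The reformatted assignment above is black 23.x at work: when the targets of a tuple unpacking do not fit on one line, black wraps them in parentheses, one target per line. Both spellings are identical at runtime; a sketch with a hypothetical stand-in for `extension._run_model_on_fold(...)`:

```python
def run_fold():
    # Hypothetical stand-in returning a 4-tuple, as in the diff above.
    return [0, 1], None, {}, None


pred_y, proba_y, measures, trace = run_fold()  # compact form
(
    pred_y,
    proba_y,
    measures,
    trace,
) = run_fold()  # black-formatted form; same unpacking
```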
3 changes: 1 addition & 2 deletions openml/runs/trace.py
@@ -55,7 +55,7 @@ def get_selected_iteration(self, fold: int, repeat: int) -> int:
The trace iteration from the given fold and repeat that was
selected as the best iteration by the search procedure
"""
for (r, f, i) in self.trace_iterations:
for r, f, i in self.trace_iterations:
if r == repeat and f == fold and self.trace_iterations[(r, f, i)].selected is True:
return i
raise ValueError(
@@ -345,7 +345,6 @@ def trace_from_xml(cls, xml):

@classmethod
def merge_traces(cls, traces: List["OpenMLRunTrace"]) -> "OpenMLRunTrace":

merged_trace = (
OrderedDict()
) # type: OrderedDict[Tuple[int, int, int], OpenMLTraceIteration] # noqa E501
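For readers unfamiliar with the structure iterated in `get_selected_iteration` above: trace iterations live in a mapping keyed by `(repeat, fold, iteration)` tuples, with the best iteration per fold/repeat flagged as selected by the search procedure. A toy reconstruction, where `Iter` is a hypothetical stand-in for `OpenMLTraceIteration`:

```python
from collections import OrderedDict
from typing import NamedTuple


class Iter(NamedTuple):  # hypothetical stand-in for OpenMLTraceIteration
    selected: bool


trace_iterations = OrderedDict(
    {(0, 0, 0): Iter(selected=False), (0, 0, 1): Iter(selected=True)}
)


def get_selected_iteration(fold: int, repeat: int) -> int:
    # Mirrors the loop in the diff: keys are (repeat, fold, iteration).
    for r, f, i in trace_iterations:
        if r == repeat and f == fold and trace_iterations[(r, f, i)].selected:
            return i
    raise ValueError("no iteration selected for this fold/repeat")


print(get_selected_iteration(fold=0, repeat=0))  # 1
```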
2 changes: 1 addition & 1 deletion openml/setups/functions.py
@@ -97,7 +97,7 @@ def get_setup(setup_id):

try:
return _get_cached_setup(setup_id)
except (openml.exceptions.OpenMLCacheException):
except openml.exceptions.OpenMLCacheException:
url_suffix = "/setup/%d" % setup_id
setup_xml = openml._api_calls._perform_api_call(url_suffix, "get")
with io.open(setup_file, "w", encoding="utf8") as fh:
5 changes: 2 additions & 3 deletions openml/study/study.py
@@ -73,7 +73,6 @@ def __init__(
runs: Optional[List[int]],
setups: Optional[List[int]],
):

self.study_id = study_id
self.alias = alias
self.main_entity_type = main_entity_type
@@ -100,11 +99,11 @@ def id(self) -> Optional[int]:

def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
"""Collect all information to display in the __repr__ body."""
fields = {
fields: Dict[str, Any] = {
"Name": self.name,
"Status": self.status,
"Main Entity Type": self.main_entity_type,
} # type: Dict[str, Any]
}
if self.study_id is not None:
fields["ID"] = self.study_id
fields["Study URL"] = self.openml_url
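The `fields` change above swaps a `# type:` comment for a PEP 526 inline annotation; the two are equivalent to mypy, but the inline form keeps the type next to the name. Side by side:

```python
from typing import Any, Dict

# Pre-PEP 526: the annotation rides in a comment.
fields_old = {"Name": "study"}  # type: Dict[str, Any]

# PEP 526: the annotation is part of the assignment itself.
fields_new: Dict[str, Any] = {"Name": "study"}

print(fields_old == fields_new)  # True; only the annotation style differs
```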
1 change: 0 additions & 1 deletion openml/tasks/functions.py
@@ -387,7 +387,6 @@ def get_task(


def _get_task_description(task_id):

try:
return _get_cached_task(task_id)
except OpenMLCacheException:
1 change: 0 additions & 1 deletion openml/tasks/split.py
@@ -70,7 +70,6 @@ def __eq__(self, other):

@classmethod
def _from_arff_file(cls, filename: str) -> "OpenMLSplit":

repetitions = None

pkl_filename = filename.replace(".arff", ".pkl.py3")