BUILD: add support for mypy (#324)

* BUILD: add first part of mypy support * BUILD: introduce minor changes * BUILD: refactor inspection methods * BUILD: add py.typed file * BUILD: drop python 3.6 support * REFACTOR: streamline code for affinity matrix calculations * API: remove unsupported method fit_predict from CandidateEstimatorDF * FIX: call candidate.predict(), not candidate.predic() * API: raise exception on CandidateEstimatorDF.xyz() with empty candidate * API: make type of CandidateEstimatorDF's 'candidate' param more specific * BUILD: apply review * API: pass explainer + interactions flag to ExplainerJob not explainer_fn * API: remove redundant arg explainer from __feature_affinity_matrix() * REFACTOR: use to_list with the 'optional' flag * BUILD: add cast for output_name * API: make method FittableMixin.ensure_fitted public * BUILD: update environment.yml * BUILD: update pre-commit config to load pytools and sklearndf for mypy * API: remove obsolete code to compute the std of affinity scores * API: remove obsolete properties RangePartitioner.…_bound * FIX: ensure_fitted() is now a public method * FIX: address remaining mypy errors Co-authored-by: Jan Ittner <ittner.jan@bcg.com>
BCG-X-Official · Mar 7, 2022 · 0cd01d0 · 0cd01d0
1 parent 21db1a3
commit 0cd01d0
Show file tree

Hide file tree

Showing 17 changed files with 402 additions and 261 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -26,3 +26,13 @@ repos:
       - id: check-added-large-files
       - id: check-json
       - id: check-yaml
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v0.931
+    hooks:
+      - id: mypy
+        files: src/
+        additional_dependencies:
+          - numpy>=1.22
+          - gamma-pytools>=2dev6,<3a
+          - sklearndf>=2dev3,<3a
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -79,6 +79,20 @@ stages:
               python -m pip install flake8==3.9.0 flake8-comprehensions flake8-import-order
               python -m flake8 --config tox.ini -v .
             displayName: 'Run flake8'
+      - job:
+        displayName: 'mypy'
+        steps:
+          - task: UsePythonVersion@0
+            inputs:
+              versionSpec: '3.7.*'
+            displayName: 'use Python 3.7'
+          - script: |
+              set -eux
+              eval "$(conda shell.bash hook)"
+              conda env create
+              conda activate $(project_name)-develop
+              python -m mypy src
+            displayName: 'Run mypy'
 
   # detect whether the build config (pyproject.toml) was changed -> then we must run a build test
   - stage: detect_build_config_changes

diff --git a/environment.yml b/environment.yml
@@ -5,7 +5,7 @@ channels:
 dependencies:
   # run
   - boruta_py ~= 0.3
-  - gamma-pytools >= 2.0.dev5, <3a
+  - gamma-pytools >= 2dev6, <3a
   - joblib ~= 1.1
   - lightgbm ~= 3.3
   - matplotlib ~= 3.5
@@ -15,7 +15,7 @@ dependencies:
   - scikit-learn ~= 1.0.2
   - scipy ~= 1.8
   - shap ~=0.40
-  - sklearndf >= 2dev2, < 3a
+  - sklearndf >= 2dev3, < 3a
   # build/test
   - black ~= 22.1
   - conda-build ~= 3.21
@@ -26,20 +26,21 @@ dependencies:
   - jinja2 ~= 2.11
   - markupsafe < 2.1  # markupsafe 2.1 breaks support for jinja2
   - m2r ~= 0.2
+  - mypy ~= 0.931
   - pluggy ~= 0.13
   - pre-commit ~= 2.17
   - pydata-sphinx-theme ~= 0.7
   - pytest ~= 5.4
   - pytest-cov ~= 2.12
   - pyyaml ~= 5.4
-  - sphinx ~= 4.2
+  - sphinx ~= 4.4
   - sphinx-autodoc-typehints ~= 1.12
   - toml ~= 0.10
   - tox ~= 3.24
   - yaml ~= 0.2
   # notebooks
   - jupyterlab ~= 3.2
   - nbclassic ~= 0.3
-  - nbsphinx ~= 0.7.1
+  - nbsphinx ~= 0.8.8
   - openpyxl ~= 3.0
   - seaborn ~= 0.11
diff --git a/mypy.ini b/mypy.ini
@@ -0,0 +1,35 @@
+[mypy]
+show_error_codes = True
+allow_redefinition = True
+
+[mypy-catboost.*]
+; TODO remove once PEP 561 is supported
+ignore_missing_imports = True
+
+[mypy-matplotlib.*]
+; TODO remove once PEP 561 is supported
+ignore_missing_imports = True
+
+[mypy-mpl_toolkits.*]
+; TODO remove once PEP 561 is supported
+ignore_missing_imports = True
+
+[mypy-pandas.*]
+; TODO remove once PEP 561 is supported
+ignore_missing_imports = True
+
+[mypy-pytools.*]
+; TODO remove once PEP 561 is supported
+ignore_missing_imports = True
+
+[mypy-scipy.*]
+; TODO remove once PEP 561 is supported
+ignore_missing_imports = True
+
+[mypy-sklearn.*]
+; TODO remove once PEP 561 is supported
+ignore_missing_imports = True
+
+[mypy-shap.*]
+; TODO remove once PEP 561 is supported
+ignore_missing_imports = True
diff --git a/src/facet/data/partition/_partition.py b/src/facet/data/partition/_partition.py
@@ -6,8 +6,7 @@
 import math
 import operator as op
 from abc import ABCMeta, abstractmethod
-from numbers import Number
-from typing import Any, Generic, Iterable, Optional, Sequence, Tuple, TypeVar
+from typing import Any, Generic, Optional, Sequence, Tuple, TypeVar
 
 import numpy as np
 
@@ -29,10 +28,12 @@
 # Type variables
 #
 
-
-T_Self = TypeVar("T_Self")
+T_Partitioner = TypeVar("T_Partitioner", bound="Partitioner")
+T_RangePartitioner = TypeVar("T_RangePartitioner", bound="RangePartitioner")
+T_CategoryPartitioner = TypeVar("T_CategoryPartitioner", bound="CategoryPartitioner")
 T_Values = TypeVar("T_Values")
-T_Values_Numeric = TypeVar("T_Values_Numeric", bound=Number)
+T_Values_Numeric = TypeVar("T_Values_Numeric", float, int)
+
 
 #
 # Ensure all symbols introduced below are included in __all__
@@ -47,7 +48,7 @@
 
 
 class Partitioner(
-    FittableMixin[Iterable[T_Values]], Generic[T_Values], metaclass=ABCMeta
+    FittableMixin[Sequence[T_Values]], Generic[T_Values], metaclass=ABCMeta
 ):
     """
     Abstract base class of all partitioners.
@@ -56,7 +57,7 @@ class Partitioner(
     DEFAULT_MAX_PARTITIONS = 20
 
     #: The values representing the partitions.
-    _partitions: Optional[np.ndarray]
+    _partitions: Optional[Sequence[T_Values]]
 
     #: The count of values allocated to each partition.
     _frequencies: Optional[np.ndarray]
@@ -76,7 +77,8 @@ def __init__(self, max_partitions: Optional[int] = None) -> None:
         self._partitions = None
         self._frequencies = None
 
-    __init__.__doc__ = __init__.__doc__.replace(
+    # mypy - incorrect type inference for __doc__
+    __init__.__doc__ = __init__.__doc__.replace(  # type: ignore
         "{DEFAULT_MAX_PARTITIONS}", repr(DEFAULT_MAX_PARTITIONS)
     )
 
@@ -88,19 +90,23 @@ def max_partitions(self) -> int:
         return self._max_partitions
 
     @property
-    def partitions_(self) -> np.ndarray:
+    def partitions_(self) -> Sequence[T_Values]:
         """
         The values representing the partitions.
         """
+
         self.ensure_fitted()
+        assert self._partitions is not None, "Partitioner is fitted"
         return self._partitions
 
     @property
     def frequencies_(self) -> np.ndarray:
         """
         The count of values allocated to each partition.
         """
+
         self.ensure_fitted()
+        assert self._frequencies is not None, "Partitioner is fitted"
         return self._frequencies
 
     @property
@@ -111,7 +117,12 @@ def is_categorical(self) -> bool:
         """
 
     @abstractmethod
-    def fit(self: T_Self, values: Iterable[T_Values], **fit_params: Any) -> T_Self:
+    def fit(  # type: ignore[override]
+        # todo: remove 'type: ignore' once mypy correctly infers return type
+        self: T_Partitioner,
+        values: Sequence[T_Values],
+        **fit_params: Any,
+    ) -> T_Partitioner:
         """
         Calculate the partitioning for the given observed values.
 
@@ -122,7 +133,7 @@ def fit(self: T_Self, values: Iterable[T_Values], **fit_params: Any) -> T_Self:
         """
 
     @staticmethod
-    def _as_non_empty_array(values: Iterable[Any]) -> np.ndarray:
+    def _as_non_empty_array(values: Sequence[Any]) -> np.ndarray:
         # ensure arg values is a non-empty array
         values = np.asarray(values)
         if len(values) == 0:
@@ -148,24 +159,6 @@ def __init__(self, max_partitions: Optional[int] = None) -> None:
             Sequence[Tuple[T_Values_Numeric, T_Values_Numeric]]
         ] = None
 
-    @property
-    def lower_bound(self) -> T_Values_Numeric:
-        """
-        The lower bound of the partitioning.
-
-        ``Null`` if no explicit lower bound is set.
-        """
-        return self._lower_bound
-
-    @property
-    def upper_bound(self) -> T_Values_Numeric:
-        """
-        The upper bound of the partitioning.
-
-        ``Null`` if no explicit upper bound is set.
-        """
-        return self._upper_bound
-
     @property
     def is_categorical(self) -> bool:
         """
@@ -182,26 +175,30 @@ def partition_bounds_(self) -> Sequence[Tuple[T_Values_Numeric, T_Values_Numeric
           inclusive lower bound of a partition range, and y is the exclusive upper
           bound of a partition range
         """
+
         self.ensure_fitted()
+        assert self._partition_bounds is not None, "Partitioner is fitted"
         return self._partition_bounds
 
     @property
     def partition_width_(self) -> T_Values_Numeric:
         """
         The width of each partition.
         """
+
         self.ensure_fitted()
+        assert self._step is not None, "Partitioner is fitted"
         return self._step
 
-    # noinspection PyMissingOrEmptyDocstring,PyIncorrectDocstring
-    def fit(
-        self: T_Self,
-        values: Iterable[T_Values_Numeric],
+    def fit(  # type: ignore[override]
+        # todo: remove 'type: ignore' once mypy correctly infers return type
+        self: T_RangePartitioner,
+        values: Sequence[T_Values_Numeric],
         *,
         lower_bound: Optional[T_Values_Numeric] = None,
         upper_bound: Optional[T_Values_Numeric] = None,
         **fit_params: Any,
-    ) -> T_Self:
+    ) -> T_RangePartitioner:
         r"""
         Calculate the partitioning for the given observed values.
 
@@ -220,8 +217,6 @@ def fit(
         :return: ``self``
         """
 
-        self: RangePartitioner  # support type hinting in PyCharm
-
         values = self._as_non_empty_array(values)
 
         if lower_bound is None or upper_bound is None:
@@ -305,10 +300,9 @@ def _ceil_step(step: float):
 
         return min(10 ** math.ceil(math.log10(step * m)) / m for m in [1, 2, 5])
 
-    @staticmethod
     @abstractmethod
     def _step_size(
-        lower_bound: T_Values_Numeric, upper_bound: T_Values_Numeric
+        self, lower_bound: T_Values_Numeric, upper_bound: T_Values_Numeric
     ) -> T_Values_Numeric:
         # Compute the step size (interval length) used in the partitions
         pass
@@ -348,6 +342,7 @@ def _step_size(self, lower_bound: float, upper_bound: float) -> float:
 
     @property
     def _partition_center_offset(self) -> float:
+        assert self._step is not None, "Partitioner is fitted"
         return self._step / 2
 
 
@@ -384,6 +379,7 @@ def _step_size(self, lower_bound: int, upper_bound: int) -> int:
 
     @property
     def _partition_center_offset(self) -> int:
+        assert self._step is not None, "Partitioner is fitted"
         return self._step // 2
 
 
@@ -409,11 +405,14 @@ def is_categorical(self) -> bool:
         return True
 
     # noinspection PyMissingOrEmptyDocstring
-    def fit(self: T_Self, values: Iterable[T_Values], **fit_params: Any) -> T_Self:
+    def fit(  # type: ignore[override]
+        # todo: remove 'type: ignore' once mypy correctly infers return type
+        self: T_CategoryPartitioner,
+        values: Sequence[T_Values],
+        **fit_params: Any,
+    ) -> T_CategoryPartitioner:
         """[see superclass]"""
 
-        self: CategoryPartitioner  # support type hinting in PyCharm
-
         values = self._as_non_empty_array(values)
 
         partitions, frequencies = np.unique(values, return_counts=True)