feat(config): structured config using pydantic

Hierarchical configuration in config.py provides context to options. This setup prevents code duplication (functions with many arguments, various places to set defaults, docstrings...). Introducing new parameters is simplified. Documentation is updated to reflect the new syntax. BREAKING CHANGE: new configuration syntax.
ing-bank · Jul 4, 2022 · bc52aeb · bc52aeb
1 parent fb65494
commit bc52aeb
Show file tree

Hide file tree

Showing 11 changed files with 531 additions and 771 deletions.
diff --git a/popmon/config.py b/popmon/config.py
@@ -16,66 +16,98 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+from pathlib import Path
+from typing import Literal, Optional, Union
 
-from popmon.analysis.comparison.comparisons import Comparisons
-from popmon.analysis.profiling.profiles import Profiles
-
-profiles = Profiles.get_descriptions()
-
-
-comparisons = {
-    "ks": "Kolmogorov-Smirnov test statistic comparing each time slot to {ref}",
-    "ks_zscore": "Z-score of the Kolmogorov-Smirnov test, comparing each time slot with {ref}",
-    "ks_pvalue": "p-value of the Kolmogorov-Smirnov test, comparing each time slot with {ref}",
-    "pearson": "Pearson correlation between each time slot and {ref}",
-    "chi2": "Chi-squared test statistic, comparing each time slot with {ref}",
-    "chi2_norm": "Normalized chi-squared statistic, comparing each time slot with {ref}",
-    "chi2_pvalue": "p-value of the chi-squared statistic, comparing each time slot with {ref}",
-    "chi2_zscore": "Z-score of the chi-squared statistic, comparing each time slot with {ref}",
-    "chi2_max_residual": "The largest absolute normalized residual (|chi|) observed in all bin pairs "
-    + "(one histogram in a time slot and one in {ref})",
-    "chi2_spike_count": "The number of normalized residuals of all bin pairs (one histogram in a time"
-    + " slot and one in {ref}) with absolute value bigger than a given threshold (default: 7).",
-    "unknown_labels": "Are categories observed in a given time slot that are not present in {ref}?",
-}
-comparisons.update(Comparisons.get_descriptions())
-
-references = {
-    "ref": "the reference data",
-    "roll": "a rolling window",
-    "prev1": "the preceding time slot",
-    "expanding": "all preceding time slots",
-}
-
-alerts = {
-    "n_green": "Total number of green traffic lights (observed for all statistics)",
-    "n_yellow": "Total number of  yellow traffic lights (observed for all statistics)",
-    "n_red": "Total number of red traffic lights (observed for all statistics)",
-    "worst": "Worst traffic light (observed for all statistics)",
-}
-
-section_descriptions = {
-    "profiles": """Basic statistics of the data (profiles) calculated for each time period (a period
-                   is represented by one bin). The yellow and red lines represent the corresponding
-                   traffic light bounds (default: 4 and 7 standard deviations with respect to the reference data).""",
-    "comparisons": "Statistical comparisons of each time period (one bin) to the reference data.",
-    "traffic_lights": "Traffic light calculation for different statistics (based on the calculated normalized residual, a.k.a. pull). Statistics for which all traffic lights are green are hidden from view by default.",
-    "alerts": "Alerts aggregated by all traffic lights for each feature.",
-    "histograms": "Histograms of the last few time slots (default: 2).",
-    "overview": "Alerts aggregated per feature",
-}
-
-histograms = {
-    "heatmap": "The heatmap shows the frequency of each value over time. If a variable has a high number of distinct values"
-    "(i.e. has a high cardinality), then the most frequent values are displayed and the remaining are grouped as 'Others'. "
-    "The maximum number of values to should is configurable (default: 20).",
-    "heatmap_column_normalized": "The column-normalized heatmap allows for comparing of time bins when the counts in each bin vary.",
-    "heatmap_row_normalized": "The row-normalized heatmaps allows for monitoring one value over time.",
-}
-
-config = {
-    "section_descriptions": section_descriptions,
-    "limited_stats": [
+from pydantic import BaseModel, BaseSettings
+from pydantic.fields import Field
+
+# Global configuration for the joblib parallelization. Could be used to change the number of jobs, and/or change
+# the backend from default (loky) to 'multiprocessing' or 'threading'.
+# (see https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html for details)
+parallel_args = {"n_jobs": 1}
+
+# Use the `ing_matplotlib_theme`
+themed = True
+
+
+class ProfilesSection(BaseModel):
+    name = "Profiles"
+    description = """Basic statistics of the data (profiles) calculated for each time period (a period
+                       is represented by one bin). The yellow and red lines represent the corresponding
+                       traffic light bounds (default: 4 and 7 standard deviations with respect to the reference data)."""
+
+
+class AlertSection(BaseModel):
+    name = "Alerts"
+    description = "Alerts aggregated by all traffic lights for each feature."
+
+    descriptions = {
+        "n_green": "Total number of green traffic lights (observed for all statistics)",
+        "n_yellow": "Total number of  yellow traffic lights (observed for all statistics)",
+        "n_red": "Total number of red traffic lights (observed for all statistics)",
+    }
+
+
+class HistogramSectionModel(BaseModel):
+    name = "Histograms"
+    description = "Histograms of the last few time slots (default: 2)."
+
+    hist_names: list[
+        Literal["heatmap", "heatmap_column_normalized", "heatmap_row_normalized"]
+    ] = [
+        "heatmap",
+        "heatmap_column_normalized",
+        "heatmap_row_normalized",
+    ]
+    hist_names_formatted = {
+        "heatmap": "Heatmap",
+        "heatmap_column_normalized": "Column-Normalized Heatmap",
+        "heatmap_row_normalized": "Row-Normalized Heatmap",
+    }
+    descriptions = {
+        "heatmap": "The heatmap shows the frequency of each value over time. If a variable has a high number of distinct values"
+        "(i.e. has a high cardinality), then the most frequent values are displayed and the remaining are grouped as 'Others'. "
+        "The maximum number of values to should is configurable (default: 20).",
+        "heatmap_column_normalized": "The column-normalized heatmap allows for comparing of time bins when the counts in each bin vary.",
+        "heatmap_row_normalized": "The row-normalized heatmaps allows for monitoring one value over time.",
+    }
+    plot_hist_n: int = 2
+    cmap: str = "autumn_r"
+
+
+class TrafficLightsSection(BaseModel):
+    name = "Traffic Lights"
+    description = "Traffic light calculation for different statistics (based on the calculated normalized residual, a.k.a. pull). Statistics for which all traffic lights are green are hidden from view by default."
+
+
+class ComparisonsSection(BaseModel):
+    name = "Comparisons"
+    description = (
+        "Statistical comparisons of each time period (one bin) to the reference data."
+    )
+
+
+class OverviewSection(BaseModel):
+    name = "Overview"
+    description = "Alerts aggregated per feature"
+
+
+class Section(BaseModel):
+    profiles: ProfilesSection = ProfilesSection()
+    alerts: AlertSection = AlertSection()
+    histograms: HistogramSectionModel = HistogramSectionModel()
+    overview: OverviewSection = OverviewSection()
+    comparisons: ComparisonsSection = ComparisonsSection()
+    traffic_lights: TrafficLightsSection = TrafficLightsSection()
+
+
+def get_stats():
+    from popmon.analysis.comparison.comparisons import Comparisons
+
+    comparisons = Comparisons.get_descriptions()
+
+    stats = [
         "distinct*",
         "filled*",
         "nan*",
@@ -90,46 +122,70 @@
         "phik*",
         "*unknown_labels*",
         "*chi2_norm*",
-        "*ks*",
         "*zscore*",
         "n_*",
-        "worst",
-    ],
-}
-for key in Comparisons.get_comparisons().keys():
-    config["limited_stats"].append(f"*{key}*")
-
-
-def get_stat_description(name: str):
-    """Gets the description of a statistic.
-
-    :param str name: the name of the statistic.
-
-    :returns str: the description of the statistic. If not found, returns an empty string
-    """
-    if not isinstance(name, str):
-        raise TypeError("Statistic's name should be a string.")
-
-    if name in histograms:
-        return histograms[name]
-    if name in profiles:
-        return profiles[name]
-    if name in alerts:
-        return alerts[name]
-
-    head, *tail = name.split("_")
-    tail = "_".join(tail)
-
-    if tail in comparisons and head in references:
-        return comparisons[tail].format(ref=references[head])
-
-    return ""
-
-
-# Global configuration for the joblib parallelization. Could be used to change the number of jobs, and/or change
-# the backend from default (loki) to 'multiprocessing' or 'threading'.
-# (see https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html for details)
-parallel_args = {"n_jobs": 1}
-
-# Usage the `ing_matplotlib_theme`
-themed = True
+    ]
+
+    for key in comparisons.keys():
+        stats.append(f"*{key}*")
+
+    return stats
+
+
+class Report(BaseModel):
+    """Report-specific configuration"""
+
+    skip_empty_plots: bool = True
+    last_n: int = 0
+    skip_first_n: int = 0
+    skip_last_n: int = 0
+    report_filepath: Optional[Union[str, Path]] = None
+    # if set to false, then smaller show_stats
+    # if limited report is selected, check if stats list is provided, if not, get a default minimal list
+    # show_stats = show_stats if not extended_report else None
+    extended_report: bool = True
+    show_stats: list[str] = Field(default_factory=get_stats)
+    section: Section = Section()
+    top_n: int = 20
+
+
+class Comparison(BaseModel):
+    window = 10
+    shift = 1
+
+
+class Monitoring(BaseModel):
+    monitoring_rules: dict[str, list[float]] = {
+        "*_pull": [7, 4, -4, -7],
+        "*_zscore": [7, 4, -4, -7],
+        "[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
+    }
+    pull_rules: dict[str, list[float]] = {"*_pull": [7, 4, -4, -7]}
+
+
+class Settings(BaseSettings):
+    report: Report = Report()
+    comparison: Comparison = Comparison()
+    monitoring: Monitoring = Monitoring()
+
+    @classmethod
+    def get_keys(cls):
+        aliases = {}
+        ambiguous = []
+        for key, value in cls.schema()["properties"].items():
+            if key in aliases:
+                ambiguous.append(key)
+                del aliases[key]
+            elif key in ambiguous:
+                continue
+
+            if "allOf" in value:
+                for skey, svalue in value["default"].items():
+                    if skey in aliases:
+                        ambiguous.append(key)
+                        del aliases[key]
+                    else:
+                        aliases[skey] = (key, skey)
+            else:
+                aliases[key] = key
+        return aliases
diff --git a/popmon/pipeline/amazing_pipeline.py b/popmon/pipeline/amazing_pipeline.py
@@ -23,7 +23,6 @@
 from popmon import resources
 
 from ..base import Pipeline
-from ..config import config
 from ..io import JsonReader
 from ..pipeline.report_pipelines import SelfReference
 
@@ -47,7 +46,6 @@ def run():
     )
 
     cfg = {
-        **config,
         "histograms_path": resources.data("synthetic_histograms.json"),
         "hists_key": "hists",
         "ref_hists_key": "hists",
@@ -60,7 +58,6 @@ def run():
             "*_zscore": [7, 4, -4, -7],
         },
         "pull_rules": {"*_pull": [7, 4, -4, -7]},
-        "show_stats": config["limited_stats"],
     }
 
     pipeline = AmazingPipeline(**cfg)