diff --git a/popmon/analysis/comparison/__init__.py b/popmon/analysis/comparison/__init__.py index 906f5b02..209fa8e2 100644 --- a/popmon/analysis/comparison/__init__.py +++ b/popmon/analysis/comparison/__init__.py @@ -18,6 +18,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +import popmon.analysis.comparison.comparisons # noqa + from ...analysis.comparison.hist_comparer import ( ExpandingHistComparer, ExpandingNormHistComparer, diff --git a/popmon/analysis/comparison/comparisons.py b/popmon/analysis/comparison/comparisons.py index eb88f2a0..099b850c 100644 --- a/popmon/analysis/comparison/comparisons.py +++ b/popmon/analysis/comparison/comparisons.py @@ -16,26 +16,259 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -from typing import Callable -class Comparisons: - _comparison_descriptions = {} - _comparison_funcs = {} +import numpy as np +from scipy import stats - @classmethod - def register(cls, key: str, description: str): - def f(func: Callable): - cls._comparison_descriptions[key] = description - cls._comparison_funcs[key] = func - return func +from popmon.analysis.comparison.comparison_registry import Comparisons - return f - @classmethod - def get_comparisons(cls): - return cls._comparison_funcs +@Comparisons.register( + key="max_prob_diff", + description="The largest absolute difference between all bin pairs of two normalized histograms (one histogram in a time slot and one in {ref})", +) +def googl_test(bins_1, bins_2): + """Google-paper test - @classmethod - def get_descriptions(cls): - return cls._comparison_descriptions + Reference link: https://mlsys.org/Conferences/2019/doc/2019/167.pdf + + :param bins_1: first array of bin entries + :param bins_2: second array of entries + + :return: maximum difference between the two entry distributions + :rtype: float + """ + + def dist(bins): + sum_ = np.sum(bins) + return bins / sum_ if sum_ else bins + + return np.max(np.abs(dist(bins_1) - dist(bins_2))) + + +@Comparisons.register(key="psi", description="Population Stability Index") +def population_stability_index(po, qo): + epsilon = 10e-6 + p = po.copy() + q = qo.copy() + p += epsilon + q += epsilon + return np.sum((p - q) * np.log(p / q)) + + +def kullback_leibler_divergence(po, qo): + epsilon = 10e-6 + p = po.copy() + q = qo.copy() + p += epsilon + q += epsilon + return np.sum(p * np.log(p / q)) + + +@Comparisons.register(key="jsd", description="Jensen-Shannon Divergence") +def jensen_shannon_divergence(p, q): + m = 0.5 * (p + q) + return 0.5 * (kullback_leibler_divergence(p, m) + kullback_leibler_divergence(q, m)) + + +def ks_test(hist_1, hist_2): + """KS-test for two histograms with different number of entries + + Copyright ROOT: + Formulas translated from c++ to python, but formulas otherwise not modified. + Reference: link: https://root.cern.ch/doc/master/classTH1.html#TH1:KolmogorovTest + GNU license: https://root.cern.ch/license + All modifications copyright ING WBAA. 
+ + :param hist_1: 1D array with bin counts of the histogram_1 + :param hist_2: 1D array with bin counts of the histogram_2 + + :return: ks_score: Kolmogorov-Smirnov Test score + :rtype: float + """ + if len(hist_1) == 0 or len(hist_2) == 0: + raise ValueError("Input histogram(s) has zero size.") + if len(hist_1) != len(hist_2): + raise ValueError("Input histograms have unequal size.") + + sum_1 = np.sum(hist_1) + sum_2 = np.sum(hist_2) + if sum_1 == 0 or sum_2 == 0: + return np.nan + + normalized_cumsum_1 = np.cumsum(hist_1) / sum_1 + normalized_cumsum_2 = np.cumsum(hist_2) / sum_2 + + d = np.abs(normalized_cumsum_1 - normalized_cumsum_2) + + return np.max(d) * np.sqrt(sum_1 * sum_2 / (sum_1 + sum_2)) + + +def ks_prob(testscore): + """KS-probability corresponding ti KS test score + + Copyright ROOT: + Formulas translated from c++ to python, but formulas otherwise not modified. + Reference: https://root.cern.ch/doc/master/classTH1.html#TH1:KolmogorovTest + GNU license: https://root.cern.ch/license + All modifications copyright ING WBAA. + + :param float testscore: Kolmogorov-Smirnov test score + + :return: approximate pvalue for the Kolmogorov-Smirnov test score + :rtype: float + """ + fj = np.array([-2, -8, -18, -32]) + r = np.zeros(4) + + w = 2.50662827 + c = np.array([-1.2337005501361697, -11.103304951225528, -30.842513753404244]) + + u = abs(testscore) + pvalue = np.nan + if u < 0.2: + pvalue = 1 + elif u < 0.755: + v = np.power(u, -2) + pvalue = 1 - w * np.exp(c * v).sum() / u + elif u < 6.8116: + v = np.power(u, 2) + max_j = int(max(1, round(3.0 / u))) + r[:max_j] = np.exp(fj[:max_j] * v) + pvalue = 2 * (r[0] - r[1] + r[2] - r[3]) + + return pvalue + + +@Comparisons.register( + key=["ks", "ks_pvalue", "ks_zscore"], + description=[ + "Kolmogorov-Smirnov test statistic comparing each time slot to {ref}", + "p-value of the Kolmogorov-Smirnov test, comparing each time slot with {ref}", + "Z-score of the Kolmogorov-Smirnov test, comparing each time slot with {ref}", + ], + dim=1, + htype="num", +) +def ks(p, q, *args): + # KS-test only properly defined for (ordered) 1D interval variables + ks_testscore = ks_test(p, q) + ks_pvalue = ks_prob(ks_testscore) + ks_zscore = -stats.norm.ppf(ks_pvalue) + return ks_testscore, ks_pvalue, ks_zscore + + +@Comparisons.register( + key="unknown_labels", + description="Are categories observed in a given time slot that are not present in {ref}?", + dim=1, + htype="cat", +) +def unknown_labels(hist1, hist2): + # check consistency of bin_labels + labels1 = hist1.keySet + labels2 = hist2.keySet + subset = labels1 <= labels2 + return int(not subset) + + +@Comparisons.register( + key="pearson", + description="Pearson correlation between each time slot and {ref}", + dim=(2,), +) +def pearson(p, q, *args): + # calculate pearson coefficient + pearson_coeff = np.nan + if len(p) >= 2: + same0 = all(p == p[0]) + same1 = all(q == q[0]) + if not same0 and not same1: + # this avoids std==0, and thereby avoid runtime warnings + pearson_coeff, _ = stats.pearsonr(p, q) + return pearson_coeff + + +def uu_chi2(n, m): + """Normalized Chi^2 formula for two histograms with different number of entries + + Copyright ROOT: + Formulas translated from c++ to python, but formulas otherwise not modified. + Reference: https://root.cern.ch/doc/master/classTH1.html#a6c281eebc0c0a848e7a0d620425090a5 + GNU License: https://root.cern.ch/license + All modifications copyright ING WBAA. 
+ + :param n: 1d array with bin counts of the reference set + :param m: 1d array with bin counts of the test set + :return: tuple of floats (chi2_value, chi2_norm, z_score, p_value, res) + """ + + def _not_finite_to_zero(x): + res = x.copy() + res[~np.isfinite(res)] = 0 + return res + + if len(n) == 0 or len(m) == 0: + raise ValueError("Input histogram(s) has zero size.") + if len(n) != len(m): + raise ValueError("Input histograms have unequal size.") + + N = np.sum(n) + M = np.sum(m) + + if N == 0 or M == 0: + return np.nan, np.nan, np.nan, np.nan, [0] * len(n) + + # remove all zero entries in the sum, to present division by zero for individual bins + z = n + m + n = n[z != 0] + m = m[z != 0] + + dof = ((n != 0) | (m != 0)).sum() - 1 + chi2_value = _not_finite_to_zero(((M * n - N * m) ** 2) / (n + m)).sum() / M / N + + chi2_norm = chi2_value / dof if dof > 0 else np.nan + p_value = stats.chi2.sf(chi2_value, dof) + z_score = -stats.norm.ppf(p_value) + + p = (n + m) / (N + M) + + if (p == 1).any(): + # unusual case of (only) one bin with p==1, avoids division with zero below + res = np.array([np.nan] * len(p)) + else: + res = _not_finite_to_zero( + (n - N * p) / np.sqrt(N * p) / np.sqrt((1 - N / (N + M)) * (1 - p)) + ) + + return chi2_value, chi2_norm, z_score, p_value, res + + +@Comparisons.register( + key=[ + "chi2", + "chi2_norm", + "chi2_zscore", + "chi2_pvalue", + "chi2_max_residual", + "chi2_spike_count", + ], + description=[ + "Chi-squared test statistic, comparing each time slot with {ref}", + "Normalized chi-squared statistic, comparing each time slot with {ref}", + "Z-score of the chi-squared statistic, comparing each time slot with {ref}", + "p-value of the chi-squared statistic, comparing each time slot with {ref}", + "The largest absolute normalized residual (|chi|) observed in all bin pairs " + + "(one histogram in a time slot and one in {ref})", + "The number of normalized residuals of all bin pairs (one histogram in a time" + + " slot and one in {ref}) with absolute value bigger than a given threshold (default: 7).", + ], +) +def chi2(*args, max_res_bound=7.0): + chi2r, chi2_norm, zscore, pvalue, res = uu_chi2(*args) + abs_residual = np.abs(res) + chi2_max_residual = np.max(abs_residual) + chi2_spike_count = np.sum(abs_residual[abs_residual > max_res_bound]) + + return chi2r, chi2_norm, zscore, pvalue, chi2_max_residual, chi2_spike_count diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py index 63f22dd7..750ea2f7 100644 --- a/popmon/analysis/comparison/hist_comparer.py +++ b/popmon/analysis/comparison/hist_comparer.py @@ -20,7 +20,6 @@ import numpy as np import pandas as pd -from scipy.stats import norm, pearsonr from ...analysis.apply_func import ApplyFunc from ...analysis.functions import ( @@ -39,10 +38,9 @@ ) from ...base import Pipeline from ...hist.hist_utils import COMMON_HIST_TYPES, is_numeric -from ...stats.numpy import ks_prob, ks_test, uu_chi2 -def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): +def hist_compare(row, hist_name1="", hist_name2=""): """Function to compare two histograms Apply statistical tests to compare two input histograms, such as: @@ -52,28 +50,11 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): :param pd.Series row: row to apply compare function to :param str hist_name1: name of histogram one to compare :param str hist_name2: name of histogram two to compare - :param float max_res_bound: count number of normalized residuals with (absolute) value 
greater than X. - Default is 7.0. :return: pandas Series with popular comparison metrics. """ - from .comparisons import Comparisons - - x = { - "ks": np.nan, - "ks_zscore": np.nan, - "ks_pvalue": np.nan, - "pearson": np.nan, - "chi2": np.nan, - "chi2_norm": np.nan, - "chi2_zscore": np.nan, - "chi2_pvalue": np.nan, - "chi2_max_residual": np.nan, - "chi2_spike_count": np.nan, - "unknown_labels": np.nan, - } - - for key in Comparisons.get_comparisons().keys(): - x[key] = np.nan + from .comparison_registry import Comparisons + + x = {key: np.nan for key in Comparisons.get_keys()} # basic name checks cols = row.index.to_list() @@ -93,48 +74,48 @@ def hist_compare(row, hist_name1="", hist_name2="", max_res_bound=7.0): # compare if hist1.n_dim == 1: + entries_list = get_consistent_numpy_entries([hist1, hist2]) if is_numeric(hist1): - # KS-test only properly defined for (ordered) 1D interval variables - entries_list = get_consistent_numpy_entries([hist1, hist2]) - ks_testscore = ks_test(*entries_list) - x["ks"] = ks_testscore - ks_pvalue = ks_prob(ks_testscore) - x["ks_pvalue"] = ks_pvalue - x["ks_zscore"] = -norm.ppf(ks_pvalue) - else: # categorical - entries_list = get_consistent_numpy_entries([hist1, hist2]) - # check consistency of bin_labels - labels1 = hist1.keySet - labels2 = hist2.keySet - subset = labels1 <= labels2 - x["unknown_labels"] = int(not subset) + htype = "num" + args = entries_list + else: + htype = "cat" + args = [hist1, hist2] + + for key, func in Comparisons.get_comparisons(dim=1, htype=htype).items(): + results = func(*args) + if len(key) == 1: + results = (results,) + for k, v in zip(key, results): + x[k] = v + + for key, func in Comparisons.get_comparisons(dim=1, htype="all").items(): + results = func(*entries_list) + if len(key) == 1: + results = (results,) + for k, v in zip(key, results): + x[k] = v else: numpy_ndgrids = get_consistent_numpy_ndgrids([hist1, hist2], dim=hist1.n_dim) entries_list = [entry.flatten() for entry in numpy_ndgrids] - # calculate pearson coefficient - pearson, pvalue = (np.nan, np.nan) - if len(entries_list[0]) >= 2: - same0 = all(entries_list[0] == entries_list[0][0]) - same1 = all(entries_list[1] == entries_list[1][0]) - if not same0 and not same1: - # this avoids std==0, and thereby avoid runtime warnings - pearson, pvalue = pearsonr(*entries_list) - - chi2, chi2_norm, zscore, pvalue, res = uu_chi2(*entries_list) - abs_residual = np.abs(res) - chi2_max_residual = np.max(abs_residual) - chi2_spike_count = np.sum(abs_residual[abs_residual > max_res_bound]) - - x["pearson"] = pearson - x["chi2"] = chi2 - x["chi2_norm"] = chi2_norm - x["chi2_zscore"] = zscore - x["chi2_pvalue"] = pvalue - x["chi2_max_residual"] = chi2_max_residual - x["chi2_spike_count"] = chi2_spike_count - for key, func in Comparisons.get_comparisons().items(): - x[key] = func(*entries_list) + for key, func in Comparisons.get_comparisons(dim=(2,)).items(): + results = func(*entries_list) + if len(key) == 1: + results = (results,) + for k, v in zip(key, results): + x[k] = v + + for key, func in Comparisons.get_comparisons(dim=-1).items(): + results = func(*entries_list) + if len(key) == 1: + results = (results,) + for k, v in zip(key, results): + x[k] = v + + if len(set(x.keys()) - set(Comparisons.get_keys())) > 0: + raise ValueError("Could not compute full comparison") + return pd.Series(x) @@ -149,7 +130,6 @@ def __init__( assign_to_key=None, hist_col="histogram", suffix="comp", - max_res_bound=7.0, *args, **kwargs, ): @@ -161,8 +141,6 @@ def __init__( :param str 
assign_to_key: key of the input data to assign function applied-output to. (optional) :param str hist_col: column/key in input df/dict that contains the histogram. default is 'histogram' :param str suffix: column/key of rolling histogram. default is 'roll' -> column = 'histogram_roll' - :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. - Default is 7.0. :param args: (tuple, optional): residual args passed on to func_mean and func_std :param kwargs: (dict, optional): residual kwargs passed on to func_mean and func_std """ @@ -188,7 +166,6 @@ def __init__( "hist_name2": hist_col + "_" + suffix, "prefix": suffix, "axis": 1, - "max_res_bound": max_res_bound, } ], ) @@ -207,7 +184,6 @@ def __init__( shift=1, hist_col="histogram", suffix="roll", - max_res_bound=7.0, ): """Initialize an instance of RollingHistComparer. @@ -217,8 +193,6 @@ def __init__( :param int shift: shift of rolling window. default is 1. :param str hist_col: column/key in input df/dict that contains the histogram. default is 'histogram' :param str suffix: column/key of rolling histogram. default is 'roll' -> column = 'histogram_roll' - :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. - Default is 7.0. """ super().__init__( rolling_hist, @@ -227,7 +201,6 @@ def __init__( read_key, hist_col, suffix, - max_res_bound, window=window, shift=shift, hist_name=hist_col, @@ -251,7 +224,6 @@ def __init__( store_key, hist_col="histogram", suffix="prev1", - max_res_bound=7.0, ): """Initialize an instance of PreviousHistComparer. @@ -259,8 +231,6 @@ def __init__( :param str store_key: key of output data to store in data store :param str hist_col: column/key in input df/dict that contains the histogram. default is 'histogram' :param str suffix: column/key of rolling histogram. default is 'prev' -> column = 'histogram_prev' - :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. - Default is 7.0. """ super().__init__( read_key, @@ -269,7 +239,6 @@ def __init__( shift=1, hist_col=hist_col, suffix=suffix, - max_res_bound=max_res_bound, ) @@ -283,7 +252,6 @@ def __init__( shift=1, hist_col="histogram", suffix="expanding", - max_res_bound=7.0, ): """Initialize an instance of ExpandingHistComparer. @@ -292,8 +260,6 @@ def __init__( :param int shift: shift of rolling window. default is 1. :param str hist_col: column/key in input df/dict that contains the histogram. default is 'histogram' :param str suffix: column/key of rolling histogram. default is 'expanding' -> column = 'histogram_expanding' - :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. - Default is 7.0. """ super().__init__( expanding_hist, @@ -302,7 +268,6 @@ def __init__( read_key, hist_col, suffix, - max_res_bound, shift=shift, hist_name=hist_col, ) @@ -325,7 +290,6 @@ def __init__( store_key, hist_col="histogram", suffix="ref", - max_res_bound=7.0, ): """Initialize an instance of ReferenceHistComparer. @@ -334,8 +298,6 @@ def __init__( :param str store_key: key of output data to store in data store :param str hist_col: column/key in input df/dict that contains the histogram. default is 'histogram' :param str suffix: column/key of rolling histogram. default is 'ref' -> column = 'histogram_ref' - :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. - Default is 7.0. 
""" super().__init__( hist_sum, @@ -344,7 +306,6 @@ def __init__( assign_to_key, hist_col, suffix, - max_res_bound, metrics=[hist_col], ) self.reference_key = reference_key diff --git a/popmon/analysis/profiling/__init__.py b/popmon/analysis/profiling/__init__.py index df5229fa..035ba17c 100644 --- a/popmon/analysis/profiling/__init__.py +++ b/popmon/analysis/profiling/__init__.py @@ -18,6 +18,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +import popmon.analysis.profiling.profiles # noqa + from ...analysis.profiling.hist_profiler import HistProfiler from ...analysis.profiling.pull_calculator import ( ExpandingPullCalculator, diff --git a/popmon/analysis/profiling/hist_profiler.py b/popmon/analysis/profiling/hist_profiler.py index 4d5cfb03..10380ae2 100644 --- a/popmon/analysis/profiling/hist_profiler.py +++ b/popmon/analysis/profiling/hist_profiler.py @@ -21,137 +21,10 @@ import numpy as np import pandas as pd -from popmon.analysis.profiling.profiles import Profiles -from popmon.stats import numpy as pm_np +from popmon.analysis.profiling.profile_registry import Profiles -from ...analysis.hist_numpy import get_2dgrid from ...base import Module -from ...hist.hist_utils import get_bin_centers, is_numeric, is_timestamp, sum_entries - - -@Profiles.register( - key=["min", "max", "p01", "p05", "p16", "p50", "p84", "p95", "p99"], - description=[ - "Minimum value", - "Maximum value", - "1% percentile", - "5% percentile", - "16% percentile", - "50% percentile (median)", - "84% percentile", - "95% percentile", - "99% percentile", - ], - dim=1, - htype="num", -) -def profile_quantiles(x, w): - return pm_np.quantile( - x, q=[0.0, 1.0, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], weights=w - ) - - -@Profiles.register(key="mean", description="Mean value", dim=1, htype="num") -def profile_mean(x, w): - return pm_np.mean(x, w) - - -@Profiles.register(key="std", description="Standard deviation", dim=1, htype="num") -def profile_std(x, w): - return pm_np.std(x, w) - - -@Profiles.register(key="nan", description="Number of missing entries (NaN)", dim=1) -def profile_nan(hist): - if hasattr(hist, "nanflow"): - return hist.nanflow.entries - elif hasattr(hist, "bins") and "NaN" in hist.bins: - return hist.bins["NaN"].entries - return 0 - - -@Profiles.register( - key="overflow", - description="Number of values larger than the maximum bin-edge of the histogram.", - dim=1, -) -def profile_overflow(hist): - if hasattr(hist, "overflow"): - return hist.overflow.entries - return 0 - - -@Profiles.register( - key="underflow", - description="Number of values smaller than the minimum bin-edge of the histogram.", - dim=1, -) -def profile_underflow(hist): - if hasattr(hist, "underflow"): - return hist.underflow.entries - return 0 - - -@Profiles.register( - key="phik", - description="phi-k correlation between the two variables of the histogram", - dim=2, -) -def profile_phik(hist): - from phik import phik - - # calculate phik correlation - try: - grid = get_2dgrid(hist) - except Exception: - raise - - try: - phi_k = phik.phik_from_hist2d(observed=grid) - except ValueError: - # self.logger.debug( - # f"Not enough values in the 2d `{name}` time-split histogram to apply the phik test." 
- # ) - phi_k = np.nan - return phi_k - - -@Profiles.register( - key="count", description="Number of entries (non-NaN and NaN)", dim=None -) -def profile_count(hist): - return int(sum_entries(hist)) - - -@Profiles.register( - key="filled", - description="Number of non-missing entries (non-NaN)", - dim=1, - htype="all", -) -def profile_filled(_, bin_counts): - return bin_counts.sum() - - -@Profiles.register( - key="distinct", description="Number of distinct entries", dim=1, htype="all" -) -def profile_distinct(bin_labels, bin_counts): - return len(np.unique(bin_labels[bin_counts > 0])) - - -@Profiles.register( - key="fraction_of_true", description="", dim=1, htype="cat" -) # or type="bool" -def profile_fraction_of_true(bin_labels, bin_counts): - return pm_np.fraction_of_true(bin_labels, bin_counts) - - -@Profiles.register( - key="most_probable_value", description="Most probable value", dim=1, htype="all" -) -def profile_most_probable_value(bin_labels, bin_counts): - return bin_labels[np.argmax(bin_counts)] +from ...hist.hist_utils import get_bin_centers, is_numeric, is_timestamp class HistProfiler(Module): @@ -222,23 +95,29 @@ def _profile_1d_histogram(self, name, hist): # calc 1d-histogram statistics profile = {} - for (key, htype), func in Profiles.get_profiles(dim=1).items(): - if htype is not None and htype != otype and htype != "all": - # skipping; type not applicable - continue - - if htype is None: - args = [hist] - else: - args = [bin_labels, bin_counts] + for key, func in Profiles.get_profiles(dim=1, htype=otype).items(): + args = [bin_labels, bin_counts] + results = func(*args) + if len(key) == 1: + results = (results,) + for k, v in zip(key, results): + profile[k] = v + for key, func in Profiles.get_profiles(dim=1, htype="all").items(): + args = [bin_labels, bin_counts] results = func(*args) + if len(key) == 1: + results = (results,) + for k, v in zip(key, results): + profile[k] = v - if isinstance(key, (list, tuple)): - for k, v in zip(key, results): - profile[k] = v - else: - profile[key] = results + for key, func in Profiles.get_profiles(dim=1, htype=None).items(): + args = [hist] + results = func(*args) + if len(key) == 1: + results = (results,) + for k, v in zip(key, results): + profile[k] = v # postprocessing TS if is_ts: @@ -261,17 +140,13 @@ def _profile_nd_histogram(self, name, hist, dim): # calc nd-histogram statistics profile = {} - for (key, htype), func in Profiles.get_profiles(dim).items(): - if htype is None: - result = func(hist) - else: - raise NotImplementedError("histogram types for nD not implemented") + for key, func in Profiles.get_profiles(dim=dim).items(): + results = func(hist) - if isinstance(key, (list, tuple)): - for k, v in zip(key, result): - profile[k] = v - else: - profile[key] = result + if len(key) == 1: + results = (results,) + for k, v in zip(key, results): + profile[k] = v return profile @@ -286,10 +161,20 @@ def _profile_hist(self, split, hist_name): htype = "num" if is_num else "cat" # these are the profiled quantities we will monitor - if dimension == 1: - expected_fields = Profiles.get_profile_keys(dim=1, htype=htype) - else: - expected_fields = Profiles.get_profile_keys(dim=dimension) + expected_fields = ( + Profiles.get_keys(dim=dimension, htype=htype) + + Profiles.get_keys(dim=dimension, htype="all") + + Profiles.get_keys(dim=dimension, htype=None) + ) + + # profiles regardless of dim and htype (e.g. 
count) + expected_fields += Profiles.get_keys(dim=None, htype=None) + + # profiles regardless of dim + expected_fields += Profiles.get_keys(dim=-1, htype=htype) + expected_fields += Profiles.get_keys(dim=-1, htype="all") + expected_fields += Profiles.get_keys(dim=-1, htype=None) + expected_fields += [self.index_col, self.hist_col] # now loop over split-axis, e.g. time index, and profile each sub-hist x:y diff --git a/popmon/analysis/profiling/profiles.py b/popmon/analysis/profiling/profiles.py index c72688fa..334618f5 100644 --- a/popmon/analysis/profiling/profiles.py +++ b/popmon/analysis/profiling/profiles.py @@ -16,86 +16,189 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -from copy import copy -from typing import Callable, List, Optional, Tuple, Union - - -class Profiles: - _profile_descriptions = {} - _profile_funcs = {-1: {}} - - @classmethod - def register( - cls, - key: Union[str, List[str], Tuple[str]], - description: Union[str, List[str], Tuple[str]], - dim: Optional[int] = None, - htype: Optional[str] = None, - ): - if dim is None: - dim = -1 - if isinstance(key, list): - key = tuple(key) - - if isinstance(description, list): - description = tuple(description) - - def f(func: Callable): - if isinstance(key, tuple): - for k, d in zip(key, description): - cls._profile_descriptions[k] = d - else: - cls._profile_descriptions[key] = description - - if dim not in cls._profile_funcs: - cls._profile_funcs[dim] = {} - cls._profile_funcs[dim][(key, htype)] = func - return func - - return f - - @classmethod - def get_profiles( - cls, - dim: Optional[int] = None, - htype: Optional[str] = None, - ): - def merge(d1, d2): - x = copy(d1) - x.update(d2) - return x - - if dim is None: - v = cls._profile_funcs[-1] - else: - v = merge(cls._profile_funcs.get(dim, {}), cls._profile_funcs[-1]) - - return v - - @classmethod - def get_profile_keys( - cls, - dim: Optional[int] = None, - htype: Optional[str] = None, - ): - def flatten(input_list): - vals = [] - for v in input_list: - if isinstance(v, (list, tuple)): - for v2 in v: - vals.append(v2) - else: - vals.append(v) - return vals - - return flatten( - [ - k - for (k, dtype), v in cls.get_profiles(dim).items() - if dtype is None or htype is None or dtype == "all" or htype == dtype - ] + + +import numpy as np + +from ...analysis.hist_numpy import get_2dgrid +from ...analysis.profiling.profile_registry import Profiles +from ...hist.hist_utils import sum_entries +from ...stats import numpy as pm_np + + +@Profiles.register( + key=["min", "max", "p01", "p05", "p16", "p50", "p84", "p95", "p99"], + description=[ + "Minimum value", + "Maximum value", + "1% percentile", + "5% percentile", + "16% percentile", + "50% percentile (median)", + "84% percentile", + "95% percentile", + "99% percentile", + ], + dim=1, + htype="num", +) +def profile_quantiles(x, w): + return pm_np.quantile( + x, q=[0.0, 1.0, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], weights=w + ) + + +@Profiles.register(key="mean", description="Mean value", dim=1, htype="num") +def profile_mean(x, w): + return pm_np.mean(x, w) + + +@Profiles.register(key="std", description="Standard deviation", dim=1, htype="num") +def profile_std(x, w): + return pm_np.std(x, w) + + +@Profiles.register(key="nan", description="Number of missing entries (NaN)", dim=1) +def profile_nan(hist): + if hasattr(hist, "nanflow"): + return 
hist.nanflow.entries + elif hasattr(hist, "bins") and "NaN" in hist.bins: + return hist.bins["NaN"].entries + return 0 + + +@Profiles.register( + key="overflow", + description="Number of values larger than the maximum bin-edge of the histogram.", + dim=1, +) +def profile_overflow(hist): + if hasattr(hist, "overflow"): + return hist.overflow.entries + return 0 + + +@Profiles.register( + key="underflow", + description="Number of values smaller than the minimum bin-edge of the histogram.", + dim=1, +) +def profile_underflow(hist): + if hasattr(hist, "underflow"): + return hist.underflow.entries + return 0 + + +@Profiles.register( + key="phik", + description="phi-k correlation between the two variables of the histogram", + dim=2, +) +def profile_phik(hist): + from phik import phik + + # calculate phik correlation + try: + grid = get_2dgrid(hist) + except Exception: + raise + + try: + phi_k = phik.phik_from_hist2d(observed=grid) + except ValueError: + # self.logger.debug( + # f"Not enough values in the 2d `{name}` time-split histogram to apply the phik test." + # ) + phi_k = np.nan + return phi_k + + +@Profiles.register( + key="count", description="Number of entries (non-NaN and NaN)", dim=None +) +def profile_count(hist): + return int(sum_entries(hist)) + + +@Profiles.register( + key="filled", + description="Number of non-missing entries (non-NaN)", + dim=1, + htype="all", +) +def profile_filled(_, bin_counts): + return bin_counts.sum() + + +@Profiles.register( + key="distinct", description="Number of distinct entries", dim=1, htype="all" +) +def profile_distinct(bin_labels, bin_counts): + return len(np.unique(bin_labels[bin_counts > 0])) + + +def fraction_of_true(bin_labels, bin_entries): + """Compute fraction of 'true' labels + + :param bin_labels: Array containing numbers whose mean is desired. If `a` is not an + array, a conversion is attempted. + :param bin_entries: Array containing weights for the elements of `a`. If `weights` is not an + array, a conversion is attempted. 
+ :return: fraction of 'true' labels + """ + bin_labels = np.array(bin_labels) + bin_entries = np.array(bin_entries) + assert len(bin_labels) == len(bin_entries) + + def replace(bl): + if bl in {"True", "true"}: + return True + elif bl in {"False", "false"}: + return False + return np.nan + + # basic checks: dealing with boolean labels + # also accept strings of 'True' and 'False' + if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0: + return np.nan + if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]): + if not np.all( + [isinstance(bl, (str, np.str_, np.string_)) for bl in bin_labels] + ): + return np.nan + # all strings from hereon + n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum() + n_false = (bin_labels == "False").sum() + (bin_labels == "false").sum() + n_nan = ( + (bin_labels == "NaN").sum() + + (bin_labels == "nan").sum() + + (bin_labels == "None").sum() + + (bin_labels == "none").sum() + + (bin_labels == "Null").sum() + + (bin_labels == "null").sum() ) + if n_true + n_false + n_nan != len(bin_labels): + return np.nan + # convert string to boolean + bin_labels = np.array([replace(bl) for bl in bin_labels]) + + sum_true = np.sum([be for bl, be in zip(bin_labels, bin_entries) if bl]) + sum_false = np.sum([be for bl, be in zip(bin_labels, bin_entries) if not bl]) + sum_entries = sum_true + sum_false + if sum_entries == 0: + # all nans scenario + return np.nan + # exclude nans from fraction + return (1.0 * sum_true) / sum_entries + + +@Profiles.register(key="fraction_of_true", description="", dim=1, htype="cat") +def profile_fraction_of_true(bin_labels, bin_counts): + return fraction_of_true(bin_labels, bin_counts) + - @classmethod - def get_descriptions(cls): - return cls._profile_descriptions +@Profiles.register( + key="most_probable_value", description="Most probable value", dim=1, htype="all" +) +def profile_most_probable_value(bin_labels, bin_counts): + return bin_labels[np.argmax(bin_counts)] diff --git a/popmon/config.py b/popmon/config.py index aeb46ef5..0c118456 100644 --- a/popmon/config.py +++ b/popmon/config.py @@ -17,10 +17,9 @@ # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. from pathlib import Path -from typing import Literal, Optional, Union +from typing import Dict, List, Optional, Union from pydantic import BaseModel, BaseSettings -from pydantic.fields import Field # Global configuration for the joblib parallelization. Could be used to change the number of jobs, and/or change # the backend from default (loki) to 'multiprocessing' or 'threading'. @@ -53,9 +52,7 @@ class HistogramSectionModel(BaseModel): name = "Histograms" description = "Histograms of the last few time slots (default: 2)." - hist_names: list[ - Literal["heatmap", "heatmap_column_normalized", "heatmap_row_normalized"] - ] = [ + hist_names: List[str] = [ "heatmap", "heatmap_column_normalized", "heatmap_row_normalized", @@ -72,7 +69,14 @@ class HistogramSectionModel(BaseModel): "heatmap_column_normalized": "The column-normalized heatmap allows for comparing of time bins when the counts in each bin vary.", "heatmap_row_normalized": "The row-normalized heatmaps allows for monitoring one value over time.", } + + """ + plot_hist_n: plot histograms for last 'n' periods. 
default is 2 (optional) + """ plot_hist_n: int = 2 + """ + cmap: colormap for histogram heatmaps + """ cmap: str = "autumn_r" @@ -102,12 +106,44 @@ class Section(BaseModel): traffic_lights: TrafficLightsSection = TrafficLightsSection() -def get_stats(): - from popmon.analysis.comparison.comparisons import Comparisons +class Report(BaseModel): + """Report-specific configuration""" - comparisons = Comparisons.get_descriptions() + """ + skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) + """ + skip_empty_plots: bool = True - stats = [ + """ + last_n: plot statistic data for last 'n' periods (optional) + """ + last_n: int = 0 + + """ + skip_first_n: in plot skip first 'n' periods. last_n takes precedence (optional) + """ + skip_first_n: int = 0 + + """ + skip_last_n: in plot skip last 'n' periods. last_n takes precedence (optional) + """ + skip_last_n: int = 0 + + """ + report_filepath: the file path where to output the report (optional) + """ + report_filepath: Optional[Union[str, Path]] = None + + """ + if True, show all the generated statistics in the report (optional) + if set to False, then smaller show_stats (see below) + """ + extended_report: bool = True + + """ + show_stats: list of statistic name patterns to show in the report. If None, show all (optional) + """ + show_stats: List[str] = [ "distinct*", "filled*", "nan*", @@ -124,68 +160,86 @@ def get_stats(): "*chi2_norm*", "*zscore*", "n_*", + "*jsd*", + "*psi*", + "*max_prob_diff*", ] - for key in comparisons.keys(): - stats.append(f"*{key}*") - - return stats - - -class Report(BaseModel): - """Report-specific configuration""" + """ + top_n: limit of number of categorical items to plot (default: 20) + """ + top_n: int = 20 - skip_empty_plots: bool = True - last_n: int = 0 - skip_first_n: int = 0 - skip_last_n: int = 0 - report_filepath: Optional[Union[str, Path]] = None - # if set to false, then smaller show_stats - # if limited report is selected, check if stats list is provided, if not, get a default minimal list - # show_stats = show_stats if not extended_report else None - extended_report: bool = True - show_stats: list[str] = Field(default_factory=get_stats) section: Section = Section() - top_n: int = 20 class Comparison(BaseModel): - window = 10 - shift = 1 + """ + window: size of rolling window and/or trend detection. default is 10. + """ + + window: int = 10 + """ + shift: shift of time-bins in rolling/expanding window. default is 1. + """ + shift: int = 1 class Monitoring(BaseModel): - monitoring_rules: dict[str, list[float]] = { + """ + monitoring_rules: monitoring rules to generate traffic light alerts. + The default setting is: + + .. code-block:: python + + monitoring_rules = { + "*_pull": [7, 4, -4, -7], + "*_zscore": [7, 4, -4, -7], + "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], + } + + Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. + For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". + You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the + feature name in front. E.g. + + .. code-block:: python + + monitoring_rules = { + "featureA:*_pull": [5, 3, -3, -5], + "featureA:nan": [4, 1, 0, 0], + "*_pull": [7, 4, -4, -7], + "nan": [8, 1, 0, 0], + } + + In case of multiple rules could apply for a feature's statistic, the most specific one applies. 
+ So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule + for all other features. + """ + + monitoring_rules: Dict[str, List[Union[float, int]]] = { "*_pull": [7, 4, -4, -7], "*_zscore": [7, 4, -4, -7], "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], } - pull_rules: dict[str, list[float]] = {"*_pull": [7, 4, -4, -7]} + + """ + pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report. + Default is: + + .. code-block:: python + + pull_rules = {"*_pull": [7, 4, -4, -7]} + + This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean, + and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean. + Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. + (The same string logic applies as for monitoring_rules.) + """ + pull_rules: Dict[str, List[Union[float, int]]] = {"*_pull": [7, 4, -4, -7]} class Settings(BaseSettings): report: Report = Report() comparison: Comparison = Comparison() monitoring: Monitoring = Monitoring() - - @classmethod - def get_keys(cls): - aliases = {} - ambiguous = [] - for key, value in cls.schema()["properties"].items(): - if key in aliases: - ambiguous.append(key) - del aliases[key] - elif key in ambiguous: - continue - - if "allOf" in value: - for skey, svalue in value["default"].items(): - if skey in aliases: - ambiguous.append(key) - del aliases[key] - else: - aliases[skey] = (key, skey) - else: - aliases[key] = key - return aliases diff --git a/popmon/pipeline/metrics_pipelines.py b/popmon/pipeline/metrics_pipelines.py index 54333683..b1122d0f 100644 --- a/popmon/pipeline/metrics_pipelines.py +++ b/popmon/pipeline/metrics_pipelines.py @@ -228,7 +228,7 @@ def __init__( :param kwargs: residual keyword arguments :return: assembled self reference pipeline """ - from popmon.analysis.comparison.comparisons import Comparisons + from popmon.analysis.comparison.comparison_registry import Comparisons reference_prefix = "ref" reference_modules: List[Union[Module, Pipeline]] = [ @@ -292,7 +292,7 @@ def __init__( :param kwargs: residual keyword arguments :return: assembled external reference pipeline """ - from popmon.analysis.comparison.comparisons import Comparisons + from popmon.analysis.comparison.comparison_registry import Comparisons reference_prefix = "ref" reference_modules: List[Union[Module, Pipeline]] = [ @@ -360,7 +360,7 @@ def __init__( :param kwargs: residual keyword arguments :return: assembled rolling reference pipeline """ - from popmon.analysis.comparison.comparisons import Comparisons + from popmon.analysis.comparison.comparison_registry import Comparisons reference_prefix = "roll" reference_modules: List[Union[Module, Pipeline]] = [ @@ -426,7 +426,7 @@ def __init__( :param kwargs: residual keyword arguments :return: assembled expanding reference pipeline """ - from popmon.analysis.comparison.comparisons import Comparisons + from popmon.analysis.comparison.comparison_registry import Comparisons reference_prefix = "expanding" reference_modules: List[Union[Module, Pipeline]] = [ diff --git a/popmon/stats/numpy.py b/popmon/stats/numpy.py index 6494d5a9..16c81529 100644 --- a/popmon/stats/numpy.py +++ b/popmon/stats/numpy.py @@ -18,68 +18,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-import warnings - import numpy as np import pandas as pd -from scipy import stats - -from popmon.analysis.comparison.comparisons import Comparisons - - -def fraction_of_true(bin_labels, bin_entries): - """Compute fraction of 'true' labels - - :param bin_labels: Array containing numbers whose mean is desired. If `a` is not an - array, a conversion is attempted. - :param bin_entries: Array containing weights for the elements of `a`. If `weights` is not an - array, a conversion is attempted. - :return: fraction of 'true' labels - """ - bin_labels = np.array(bin_labels) - bin_entries = np.array(bin_entries) - assert len(bin_labels) == len(bin_entries) - - def replace(bl): - if bl in {"True", "true"}: - return True - elif bl in {"False", "false"}: - return False - return np.nan - - # basic checks: dealing with boolean labels - # also accept strings of 'True' and 'False' - if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0: - return np.nan - if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]): - if not np.all( - [isinstance(bl, (str, np.str_, np.string_)) for bl in bin_labels] - ): - return np.nan - # all strings from hereon - n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum() - n_false = (bin_labels == "False").sum() + (bin_labels == "false").sum() - n_nan = ( - (bin_labels == "NaN").sum() - + (bin_labels == "nan").sum() - + (bin_labels == "None").sum() - + (bin_labels == "none").sum() - + (bin_labels == "Null").sum() - + (bin_labels == "null").sum() - ) - if n_true + n_false + n_nan != len(bin_labels): - return np.nan - # convert string to boolean - bin_labels = np.array([replace(bl) for bl in bin_labels]) - - sum_true = np.sum([be for bl, be in zip(bin_labels, bin_entries) if bl]) - sum_false = np.sum([be for bl, be in zip(bin_labels, bin_entries) if not bl]) - sum_entries = sum_true + sum_false - if sum_entries == 0: - # all nans scenario - return np.nan - # exclude nans from fraction - return (1.0 * sum_true) / sum_entries def mean(a, weights=None, axis=None, dtype=None, keepdims=False, ddof=0): @@ -217,179 +157,6 @@ def quantile(a, q, weights=None, axis=None, keepdims: bool = False): return y -def _not_finite_to_zero(x): - res = x.copy() - res[~np.isfinite(res)] = 0 - return res - - -def uu_chi2(n, m, verbose=False): - """Normalized Chi^2 formula for two histograms with different number of entries - - Copyright ROOT: - Formulas translated from c++ to python, but formulas otherwise not modified. - Reference: https://root.cern.ch/doc/master/classTH1.html#a6c281eebc0c0a848e7a0d620425090a5 - GNU License: https://root.cern.ch/license - All modifications copyright ING WBAA. - - :param n: 1d array with bin counts of the reference set - :param m: 1d array with bin counts of the test set - :param bool verbose: if true, print warnings in case of empty histograms - :return: tuple of floats (chi2_value, chi2_norm, z_score, p_value, res) - """ - if len(n) == 0 or len(m) == 0: - raise ValueError("Input histogram(s) has zero size.") - if len(n) != len(m): - raise ValueError("Input histograms have unequal size.") - - N = np.sum(n) - M = np.sum(m) - - if N == 0 or M == 0: - if verbose: - warnings.warn( - "Input histogram(s) is empty and cannot be renormalized. Chi2 is undefined." 
- ) - return np.nan, np.nan, np.nan, np.nan, [0] * len(n) - - # remove all zero entries in the sum, to present division by zero for individual bins - z = n + m - n = n[z != 0] - m = m[z != 0] - - dof = ((n != 0) | (m != 0)).sum() - 1 - chi2_value = _not_finite_to_zero(((M * n - N * m) ** 2) / (n + m)).sum() / M / N - - chi2_norm = chi2_value / dof if dof > 0 else np.nan - p_value = stats.chi2.sf(chi2_value, dof) - z_score = -stats.norm.ppf(p_value) - - p = (n + m) / (N + M) - - if (p == 1).any(): - # unusual case of (only) one bin with p==1, avoids division with zero below - res = np.array([np.nan] * len(p)) - else: - res = _not_finite_to_zero( - (n - N * p) / np.sqrt(N * p) / np.sqrt((1 - N / (N + M)) * (1 - p)) - ) - - return chi2_value, chi2_norm, z_score, p_value, res - - -def ks_test(hist_1, hist_2): - """KS-test for two histograms with different number of entries - - Copyright ROOT: - Formulas translated from c++ to python, but formulas otherwise not modified. - Reference: link: https://root.cern.ch/doc/master/classTH1.html#TH1:KolmogorovTest - GNU license: https://root.cern.ch/license - All modifications copyright ING WBAA. - - :param hist_1: 1D array with bin counts of the histogram_1 - :param hist_2: 1D array with bin counts of the histogram_2 - - :return: ks_score: Kolmogorov-Smirnov Test score - :rtype: float - """ - if len(hist_1) == 0 or len(hist_2) == 0: - raise ValueError("Input histogram(s) has zero size.") - if len(hist_1) != len(hist_2): - raise ValueError("Input histograms have unequal size.") - - sum_1 = np.sum(hist_1) - sum_2 = np.sum(hist_2) - if sum_1 == 0 or sum_2 == 0: - return np.nan - - normalized_cumsum_1 = np.cumsum(hist_1) / sum_1 - normalized_cumsum_2 = np.cumsum(hist_2) / sum_2 - - d = np.abs(normalized_cumsum_1 - normalized_cumsum_2) - - return np.max(d) * np.sqrt(sum_1 * sum_2 / (sum_1 + sum_2)) - - -def ks_prob(testscore): - """KS-probability corresponding ti KS test score - - Copyright ROOT: - Formulas translated from c++ to python, but formulas otherwise not modified. - Reference: https://root.cern.ch/doc/master/classTH1.html#TH1:KolmogorovTest - GNU license: https://root.cern.ch/license - All modifications copyright ING WBAA. 
- - :param float testscore: Kolmogorov-Smirnov test score - - :return: approximate pvalue for the Kolmogorov-Smirnov test score - :rtype: float - """ - fj = np.array([-2, -8, -18, -32]) - r = np.zeros(4) - - w = 2.50662827 - c = np.array([-1.2337005501361697, -11.103304951225528, -30.842513753404244]) - - u = abs(testscore) - pvalue = np.nan - if u < 0.2: - pvalue = 1 - elif u < 0.755: - v = np.power(u, -2) - pvalue = 1 - w * np.exp(c * v).sum() / u - elif u < 6.8116: - v = np.power(u, 2) - max_j = int(max(1, round(3.0 / u))) - r[:max_j] = np.exp(fj[:max_j] * v) - pvalue = 2 * (r[0] - r[1] + r[2] - r[3]) - - return pvalue - - -@Comparisons.register( - key="max_prob_diff", - description="The largest absolute difference between all bin pairs of two normalized histograms (one histogram in a time slot and one in {ref})", -) -def googl_test(bins_1, bins_2): - """Google-paper test - - Reference link: https://mlsys.org/Conferences/2019/doc/2019/167.pdf - - :param bins_1: first array of bin entries - :param bins_2: second array of entries - - :return: maximum difference between the two entry distributions - :rtype: float - """ - - def dist(bins): - sum_ = np.sum(bins) - return bins / sum_ if sum_ else bins - - return np.max(np.abs(dist(bins_1) - dist(bins_2))) - - -@Comparisons.register(key="psi", description="Population Stability Index") -def population_stability_index(p, q): - epsilon = 10e-6 - p += epsilon - q += epsilon - return np.sum((p - q) * np.log(p / q)) - - -def kullback_leibler_divergence(p, q): - epsilon = 10e-6 - p += epsilon - q += epsilon - return np.sum(p * np.log(p / q)) - - -@Comparisons.register(key="jsd", description="Jensen-Shannon Divergence") -def jensen_shannon_divergence(p, q): - m = 0.5 * (p + q) - return 0.5 * (kullback_leibler_divergence(p, m) + kullback_leibler_divergence(q, m)) - - def probability_distribution_mean_covariance(entries_list): """Mean normalized histogram and covariance of list of input histograms diff --git a/popmon/visualization/alert_section_generator.py b/popmon/visualization/alert_section_generator.py index 18262562..b0b0a122 100644 --- a/popmon/visualization/alert_section_generator.py +++ b/popmon/visualization/alert_section_generator.py @@ -79,7 +79,7 @@ def __init__( self.skip_first_n = settings.skip_first_n self.skip_last_n = settings.skip_last_n self.skip_empty_plots = settings.skip_empty_plots - self.show_stats = settings.show_stats + self.show_stats = settings.show_stats if not settings.extended_report else None self.section_name = settings.section.alerts.name self.description = settings.section.alerts.description diff --git a/popmon/visualization/overview_section.py b/popmon/visualization/overview_section.py index a01ec471..46c6e31a 100644 --- a/popmon/visualization/overview_section.py +++ b/popmon/visualization/overview_section.py @@ -89,7 +89,7 @@ def __init__( self.suffices = suffices self.ignore_stat_endswith = ignore_stat_endswith or [] self.skip_empty_plots = settings.skip_empty_plots - self.show_stats = settings.show_stats + self.show_stats = settings.show_stats if not settings.extended_report else None self.section_name = settings.section.overview.name self.description = settings.section.overview.description diff --git a/popmon/visualization/section_generator.py b/popmon/visualization/section_generator.py index 5422b37c..71ce72f9 100644 --- a/popmon/visualization/section_generator.py +++ b/popmon/visualization/section_generator.py @@ -24,8 +24,8 @@ import pandas as pd from tqdm import tqdm -from 
popmon.analysis.comparison.comparisons import Comparisons -from popmon.analysis.profiling.profiles import Profiles +from popmon.analysis.comparison.comparison_registry import Comparisons +from popmon.analysis.profiling.profile_registry import Profiles from ..base import Module from ..config import Report @@ -122,7 +122,7 @@ def __init__( self.ignore_stat_endswith = ignore_stat_endswith or [] self.skip_empty_plots = settings.skip_empty_plots self.description = description - self.show_stats = settings.show_stats + self.show_stats = settings.show_stats if not settings.extended_report else None def get_description(self): return self.section_name diff --git a/popmon/visualization/traffic_light_section_generator.py b/popmon/visualization/traffic_light_section_generator.py index fad0647f..21408895 100644 --- a/popmon/visualization/traffic_light_section_generator.py +++ b/popmon/visualization/traffic_light_section_generator.py @@ -83,7 +83,7 @@ def __init__( self.suffices = suffices self.ignore_stat_endswith = ignore_stat_endswith or [] self.skip_empty_plots = settings.skip_empty_plots - self.show_stats = settings.show_stats + self.show_stats = settings.show_stats if not settings.extended_report else None self.section_name = settings.section.traffic_lights.name self.description = settings.section.traffic_lights.description
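
The refactor above replaces the hard-coded comparison block in `hist_compare` with a lookup on the `Comparisons` registry (imported from `popmon.analysis.comparison.comparison_registry`), dispatched by `dim` and `htype`. As a minimal sketch of how a user-defined comparison could plug into this mechanism — the registry module itself is not part of this diff, so the `register`/`get_keys` behaviour is inferred from the usages above, and the key `abs_mean_diff` is a hypothetical name chosen for illustration:

.. code-block:: python

    import numpy as np

    from popmon.analysis.comparison.comparison_registry import Comparisons


    @Comparisons.register(
        key="abs_mean_diff",
        description="Absolute difference between the normalized bin means of a time slot and {ref}",
        dim=1,
        htype="num",
    )
    def abs_mean_diff(bins_1, bins_2):
        # bins_1 and bins_2 are 1D arrays of bin entries, as for the other dim=1, htype="num" comparisons
        def normalize(bins):
            total = np.sum(bins)
            return bins / total if total else bins

        return float(np.abs(np.mean(normalize(bins_1)) - np.mean(normalize(bins_2))))


    # hist_compare initializes its output from Comparisons.get_keys(), so the new metric
    # should appear automatically once this registration has been imported
    assert "abs_mean_diff" in Comparisons.get_keys()

Because `popmon/analysis/comparison/__init__.py` now imports the built-in comparisons module purely for its registration side effects (the `# noqa` import), a custom registration such as this one would only need to be imported before the comparison pipeline is built.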
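
The profiling side follows the same pattern: `HistProfiler._profile_1d_histogram` now passes `(bin_labels, bin_counts)` to profiles registered with `htype` of "num", "cat" or "all", and the raw histogram object to profiles registered with `htype=None`. Below is a sketch of a custom 1D numeric profile under the same assumptions (the `profile_registry` module is likewise not shown in this diff, and the key `iqr` is hypothetical):

.. code-block:: python

    from popmon.analysis.profiling.profile_registry import Profiles
    from popmon.stats import numpy as pm_np


    @Profiles.register(
        key="iqr",
        description="Interquartile range (p75 - p25)",
        dim=1,
        htype="num",
    )
    def profile_iqr(x, w):
        # x holds the bin values/centers and w the bin counts (weights),
        # matching profile_mean and profile_std registered above
        q25, q75 = pm_np.quantile(x, q=[0.25, 0.75], weights=w)
        return float(q75 - q25)

As with the comparisons, `HistProfiler` collects the expected output columns via `Profiles.get_keys(dim=..., htype=...)`, so a registered key like this one would show up in the profile section without further wiring.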