From 4734931d34f30741ed7854a5007d3c2f677d493e Mon Sep 17 00:00:00 2001
From: Nikolaos Perrakis <89025229+nikml@users.noreply.github.com>
Date: Wed, 12 Jul 2023 22:14:27 +0300
Subject: [PATCH] Library Updates (#318)

* make threshold limits work for edge cases
* update calculator parameters docs
---
 docs/tutorials/data_quality/missing.rst       | 22 +++++++----
 docs/tutorials/data_quality/unseen.rst        | 22 +++++++----
 .../multivariate_drift_detection.rst          | 39 +++++++++++++------
 .../univariate_drift_detection.rst            | 34 ++++++++++++----
 docs/tutorials/summary_stats/avg.rst          | 20 +++++++---
 docs/tutorials/summary_stats/count.rst        | 18 ++++++---
 docs/tutorials/summary_stats/median.rst       | 20 +++++++---
 docs/tutorials/summary_stats/std.rst          | 20 +++++++---
 docs/tutorials/summary_stats/sum.rst          | 20 +++++++---
 nannyml/thresholds.py                         |  4 +-
 10 files changed, 155 insertions(+), 64 deletions(-)

diff --git a/docs/tutorials/data_quality/missing.rst b/docs/tutorials/data_quality/missing.rst
index 586e33cc..7cbf608a 100644
--- a/docs/tutorials/data_quality/missing.rst
+++ b/docs/tutorials/data_quality/missing.rst
@@ -38,14 +38,22 @@ The :class:`~nannyml.data_quality.missing.calculator.MissingValuesCalculator` cl
 the functionality needed for missing values calculations.
 We need to instantiate it with appropriate parameters:
 
-- The names of the columns to be evaluated.
-- Optionally, a boolean option indicating whether we want the absolute count of the missing
+- **column_names:** A list with the names of columns to be evaluated.
+- **normalize (Optional):** A boolean indicating whether we want the absolute count of the missing
   value instances or their relative ratio. By default it is set to true.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- Optionally, a threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **thresholds (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Missing Values.ipynb
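(Editor's note, not part of the patch: a minimal usage sketch of the parameters documented above. It assumes the calculator and the synthetic car loan dataset loader are available as top-level exports in a recent NannyML release; the column names are illustrative.)

.. code-block:: python

    import nannyml as nml

    # Reference and analysis sets from NannyML's bundled synthetic dataset.
    reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

    # Count missing values per chunk, reported as a ratio (normalize=True is the default).
    calc = nml.MissingValuesCalculator(
        column_names=['car_value', 'salary_range', 'driver_tenure'],  # illustrative columns
        normalize=True,
        timestamp_column_name='timestamp',
        chunk_size=5000,
    )
    calc.fit(reference_df)                  # fit on the reference period
    results = calc.calculate(analysis_df)   # evaluate the analysis period
    results.plot().show()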
diff --git a/docs/tutorials/data_quality/unseen.rst b/docs/tutorials/data_quality/unseen.rst
index de9fd104..a5127bf3 100644
--- a/docs/tutorials/data_quality/unseen.rst
+++ b/docs/tutorials/data_quality/unseen.rst
@@ -40,14 +40,22 @@ The :class:`~nannyml.data_quality.unseen.calculator.UnseenValuesCalculator` clas
 the functionality needed for unseen values calculations.
 We need to instantiate it with appropriate parameters:
 
-- The names of the columns to be evaluated. They need to be categorical columns.
-- Optionally, a boolean option indicating whether we want the absolute count of the unseen
+- **column_names:** A list with the names of columns to be evaluated. They need to be categorical columns.
+- **normalize (Optional):** A boolean indicating whether we want the absolute count of the unseen
   value instances or their relative ratio. By default it is set to true.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- Optionally, a threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **thresholds (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. warning::
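(Editor's note, not part of the patch: a sketch of the parameters documented above, assuming the calculator is exported at the package top level; the categorical column names are illustrative.)

.. code-block:: python

    import nannyml as nml

    reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

    # Flag category values in the analysis data that never appeared in the reference data.
    calc = nml.UnseenValuesCalculator(
        column_names=['salary_range', 'repaid_loan_on_prev_car'],  # categorical columns only
        normalize=True,
        timestamp_column_name='timestamp',
        chunk_number=10,
    )
    calc.fit(reference_df)
    results = calc.calculate(analysis_df)
    print(results.to_df().head())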
diff --git a/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst b/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst
index 39d28cff..19f40366 100644
--- a/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst
+++ b/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst
@@ -50,12 +50,30 @@ Let's start by loading some synthetic data provided by the NannyML package and s
     :cell: 2
 
 The :class:`~nannyml.drift.multivariate.data_reconstruction.calculator.DataReconstructionDriftCalculator`
-module implements this functionality. We need to instantiate it with appropriate parameters - the column names of the features we want to run drift detection on,
-and the timestamp column name. The features can be passed in as a simple list of strings. Alternatively, we can create a list by excluding the columns in the dataframe that are not features,
-and pass them into the argument.
-
-Next, the :meth:`~nannyml.base.AbstractCalculator.fit` method needs to be called on the reference data, which the results will be based on.
-Then the
+module implements this functionality. We need to instantiate it with appropriate parameters:
+
+- **column_names:** A list with the column names of the features we want to run drift detection on.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **n_components (Optional):** The n_components parameter as passed to the sklearn `PCA constructor`_.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **imputer_categorical (Optional):** An sklearn `SimpleImputer`_ object specifying an appropriate strategy
+  for imputing missing values for categorical features.
+- **imputer_continuous (Optional):** An sklearn `SimpleImputer`_ object specifying an appropriate strategy
+  for imputing missing values for continuous features.
+- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
+
+Next, the :meth:`~nannyml.base.AbstractCalculator.fit` method needs to be called on the reference data,
+which the results will be based on. Then the
 :meth:`~nannyml.base.AbstractCalculator.calculate` method will
 calculate the multivariate drift results on the provided data.
 
@@ -101,11 +119,8 @@ NannyML can also visualize the multivariate drift results in a plot. Our plot co
 
 * The purple step plot shows the reconstruction error in each chunk of the analysis
   period. Thick squared point markers indicate the middle of these chunks.
-
 * The low-saturated purple area around the reconstruction error indicates the :ref:`sampling error`.
-
 * The red horizontal dashed lines show upper and lower thresholds for alerting purposes.
-
 * If the reconstruction error crosses the upper or lower threshold an alert is raised which is indicated with a red,
   low-saturated background across the whole width of the relevant chunk. A red, diamond-shaped point marker additionally indicates
   this in the middle of the chunk.
 
@@ -118,9 +133,6 @@ NannyML can also visualize the multivariate drift results in a plot. Our plot co
 The multivariate drift results provide a concise summary of where data drift is happening in our
 input data.
 
-.. _SimpleImputer: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
-
-
 Insights
 --------
 
@@ -137,3 +149,6 @@ estimate the impact of the observed changes.
 
 For more information on how multivariate drift detection works, the
 :ref:`Data Reconstruction with PCA` explanation
 page gives more details.
+
+.. _`PCA constructor`: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
+.. _`SimpleImputer`: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
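(Editor's note, not part of the patch: a sketch showing how the parameters documented above, including the newly documented imputers, might be combined. Dataset and column names are illustrative.)

.. code-block:: python

    import nannyml as nml
    from sklearn.impute import SimpleImputer

    reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

    feature_column_names = [
        'car_value', 'salary_range', 'debt_to_income_ratio', 'loan_length',
        'repaid_loan_on_prev_car', 'size_of_downpayment', 'driver_tenure',
    ]

    calc = nml.DataReconstructionDriftCalculator(
        column_names=feature_column_names,
        timestamp_column_name='timestamp',
        chunk_size=5000,
        imputer_categorical=SimpleImputer(strategy='most_frequent'),
        imputer_continuous=SimpleImputer(strategy='mean'),
    )
    calc.fit(reference_df)                  # learn the PCA projection on reference data
    results = calc.calculate(analysis_df)
    results.plot().show()                   # reconstruction error per chunk, with thresholds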
diff --git a/docs/tutorials/detecting_data_drift/univariate_drift_detection.rst b/docs/tutorials/detecting_data_drift/univariate_drift_detection.rst
index c210433e..fd1e3a4b 100644
--- a/docs/tutorials/detecting_data_drift/univariate_drift_detection.rst
+++ b/docs/tutorials/detecting_data_drift/univariate_drift_detection.rst
@@ -49,14 +49,34 @@ We begin by loading some synthetic data provided in the NannyML package. This is
 The :class:`~nannyml.drift.univariate.calculator.UnivariateDriftCalculator` class implements the functionality
 needed for univariate drift detection. First, we need to instantiate it with the appropriate parameters:
 
-- The names of the columns to be evaluated.
-- A list of methods to use on continuous columns. You can chose from :ref:`kolmogorov_smirnov`,
-  :ref:`jensen_shannon`, :ref:`wasserstein`
-  and :ref:`hellinger`.
-- A list of methods to use on categorical columns.
-  You can choose from :ref:`chi2`, :ref:`jensen_shannon`,
+- **column_names:** A list with the names of columns to be evaluated.
+- **treat_as_categorical (Optional):** A list of column names to treat as categorical columns.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **categorical_methods (Optional):** A list of methods to use on categorical columns.
+  You can choose from :ref:`chi2`, :ref:`jensen_shannon`,
   :ref:`l_infinity`, and :ref:`hellinger`.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default chunker creating 10 chunks will be used.
+- **continuous_methods (Optional):** A list of methods to use on continuous columns.
+  You can choose from :ref:`kolmogorov_smirnov`,
+  :ref:`jensen_shannon`,
+  :ref:`wasserstein`
+  and :ref:`hellinger`.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **thresholds (Optional):** A dictionary allowing users to set a custom threshold strategy for each method.
+  It links a `Threshold` subclass to a method name.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
+- **computation_params (Optional):** A dictionary which allows users to specify whether they want drift calculated on
+  the exact reference data or an estimated distribution of the reference data obtained
+  using binning techniques. Applicable only to Kolmogorov-Smirnov and Wasserstein. For more information, see
+  :class:`~nannyml.drift.univariate.calculator.UnivariateDriftCalculator`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Drift - Univariate.ipynb
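(Editor's note, not part of the patch: a sketch pairing the documented parameters with method names taken from the lists above; dataset and column names are illustrative.)

.. code-block:: python

    import nannyml as nml

    reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

    calc = nml.UnivariateDriftCalculator(
        column_names=['car_value', 'salary_range', 'debt_to_income_ratio'],
        treat_as_categorical=['salary_range'],
        timestamp_column_name='timestamp',
        continuous_methods=['kolmogorov_smirnov', 'jensen_shannon'],
        categorical_methods=['chi2', 'jensen_shannon'],
        chunk_size=5000,
    )
    calc.fit(reference_df)
    results = calc.calculate(analysis_df)
    results.plot().show()   # one drift plot per column/method combination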
diff --git a/docs/tutorials/summary_stats/avg.rst b/docs/tutorials/summary_stats/avg.rst
index 5f4882fa..88874f14 100644
--- a/docs/tutorials/summary_stats/avg.rst
+++ b/docs/tutorials/summary_stats/avg.rst
@@ -36,12 +36,20 @@ The :class:`~nannyml.stats.avg.calculator.SummaryStatsAvgCalculator` class imple
 the functionality needed for mean values calculations.
 We need to instantiate it with appropriate parameters:
 
-- The names of the columns to be evaluated.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- Optionally, a threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **column_names:** A list with the names of columns to be evaluated.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Stats - Avg.ipynb
diff --git a/docs/tutorials/summary_stats/count.rst b/docs/tutorials/summary_stats/count.rst
index ae2194c2..b3e22fbd 100644
--- a/docs/tutorials/summary_stats/count.rst
+++ b/docs/tutorials/summary_stats/count.rst
@@ -34,11 +34,19 @@ The :class:`~nannyml.stats.count.calculator.SummaryStatsRowCountCalculator` clas
 the functionality needed for row count calculations.
 We need to instantiate it with appropriate *optional* parameters:
 
-- The name of the column containing the observation timestamps.
-- A chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- A threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Stats - Count.ipynb
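(Editor's note, not part of the patch: a sketch of the mean and row count calculators documented above, assuming they are exported at the package top level as in recent NannyML releases; column names are illustrative.)

.. code-block:: python

    import nannyml as nml

    reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

    # Track the mean of selected numeric columns per chunk.
    avg_calc = nml.SummaryStatsAvgCalculator(
        column_names=['car_value', 'debt_to_income_ratio'],
        timestamp_column_name='timestamp',
        chunk_size=5000,
    )
    avg_calc.fit(reference_df)
    avg_results = avg_calc.calculate(analysis_df)
    avg_results.plot().show()

    # Track the number of rows per chunk; every parameter is optional here.
    count_calc = nml.SummaryStatsRowCountCalculator(chunk_size=5000)
    count_calc.fit(reference_df)
    count_results = count_calc.calculate(analysis_df)
    count_results.plot().show()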
diff --git a/docs/tutorials/summary_stats/median.rst b/docs/tutorials/summary_stats/median.rst
index d13fd454..c9d96d48 100644
--- a/docs/tutorials/summary_stats/median.rst
+++ b/docs/tutorials/summary_stats/median.rst
@@ -36,12 +36,20 @@ The :class:`~nannyml.stats.avg.calculator.SummaryStatsMedianCalculator` class im
 the functionality needed for median values calculations.
 We need to instantiate it with appropriate parameters:
 
-- The names of the columns to be evaluated.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- Optionally, a threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **column_names:** A list with the names of columns to be evaluated.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Stats - Median.ipynb
diff --git a/docs/tutorials/summary_stats/std.rst b/docs/tutorials/summary_stats/std.rst
index 849a8e41..142bd8e1 100644
--- a/docs/tutorials/summary_stats/std.rst
+++ b/docs/tutorials/summary_stats/std.rst
@@ -36,12 +36,20 @@ The :class:`~nannyml.stats.std.calculator.SummaryStatsStdCalculator` class imple
 the functionality needed for standard deviation values calculations.
 We need to instantiate it with appropriate parameters:
 
-- The names of the columns to be evaluated.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- Optionally, a threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **column_names:** A list with the names of columns to be evaluated.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Stats - Std.ipynb
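(Editor's note, not part of the patch: the median and standard deviation calculators documented above follow the same pattern; the sketch below assumes top-level exports and illustrative column names.)

.. code-block:: python

    import nannyml as nml

    reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()
    columns = ['car_value', 'debt_to_income_ratio']

    # Same fit/calculate workflow for both summary statistics.
    for calculator_cls in (nml.SummaryStatsMedianCalculator, nml.SummaryStatsStdCalculator):
        calc = calculator_cls(
            column_names=columns,
            timestamp_column_name='timestamp',
            chunk_number=10,
        )
        calc.fit(reference_df)
        results = calc.calculate(analysis_df)
        print(results.to_df().head())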
diff --git a/docs/tutorials/summary_stats/sum.rst b/docs/tutorials/summary_stats/sum.rst
index da2c4179..634d9062 100644
--- a/docs/tutorials/summary_stats/sum.rst
+++ b/docs/tutorials/summary_stats/sum.rst
@@ -36,12 +36,20 @@ The :class:`~nannyml.stats.sum.calculator.SummaryStatsSumCalculator` class imple
 the functionality needed for sum values calculations.
 We need to instantiate it with appropriate parameters:
 
-- The names of the columns to be evaluated.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- Optionally, a threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **column_names:** A list with the names of columns to be evaluated.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Stats - Sum.ipynb
diff --git a/nannyml/thresholds.py b/nannyml/thresholds.py
index 667b7de6..b39e7bff 100644
--- a/nannyml/thresholds.py
+++ b/nannyml/thresholds.py
@@ -264,7 +264,7 @@ def calculate_threshold_values(
     if (
         lower_threshold_value_limit is not None
         and lower_threshold_value is not None
-        and lower_threshold_value < lower_threshold_value_limit
+        and lower_threshold_value <= lower_threshold_value_limit
     ):
         override_value = None if override_using_none else lower_threshold_value_limit
         if logger:
@@ -277,7 +277,7 @@ def calculate_threshold_values(
     if (
         upper_threshold_value_limit is not None
         and upper_threshold_value is not None
-        and upper_threshold_value > upper_threshold_value_limit
+        and upper_threshold_value >= upper_threshold_value_limit
     ):
         override_value = None if override_using_none else upper_threshold_value_limit
         if logger:
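(Editor's note, not part of the patch: a simplified, self-contained sketch of the edge case the thresholds.py change addresses — a computed threshold that lands exactly on its value limit is now also overridden. The helper below only mirrors the patched comparisons; it is not the actual nannyml.thresholds.calculate_threshold_values function.)

.. code-block:: python

    from typing import Optional, Tuple

    def clamp_thresholds(
        lower: Optional[float],
        upper: Optional[float],
        lower_limit: Optional[float] = None,
        upper_limit: Optional[float] = None,
        override_using_none: bool = False,
    ) -> Tuple[Optional[float], Optional[float]]:
        """Clamp computed thresholds to their limits, including the equality edge case."""
        if lower_limit is not None and lower is not None and lower <= lower_limit:
            lower = None if override_using_none else lower_limit
        if upper_limit is not None and upper is not None and upper >= upper_limit:
            upper = None if override_using_none else upper_limit
        return lower, upper

    # Before the patch, a computed lower threshold of exactly 0.0 was not overridden
    # when the limit was also 0.0; with `<=` the equality case is handled as well.
    print(clamp_thresholds(lower=0.0, upper=0.08, lower_limit=0.0, upper_limit=1.0))
    # -> (0.0, 0.08)
    print(clamp_thresholds(lower=0.0, upper=0.08, lower_limit=0.0, override_using_none=True))
    # -> (None, 0.08)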