From 4734931d34f30741ed7854a5007d3c2f677d493e Mon Sep 17 00:00:00 2001
From: Nikolaos Perrakis <89025229+nikml@users.noreply.github.com>
Date: Wed, 12 Jul 2023 22:14:27 +0300
Subject: [PATCH] Library Updates (#318)

* make threshold limits work for edge cases
* update calculator parameters docs
---
 docs/tutorials/data_quality/missing.rst       | 22 +++++++----
 docs/tutorials/data_quality/unseen.rst        | 22 +++++++----
 .../multivariate_drift_detection.rst          | 39 +++++++++++++------
 .../univariate_drift_detection.rst            | 34 ++++++++++++----
 docs/tutorials/summary_stats/avg.rst          | 20 +++++++---
 docs/tutorials/summary_stats/count.rst        | 18 ++++++---
 docs/tutorials/summary_stats/median.rst       | 20 +++++++---
 docs/tutorials/summary_stats/std.rst          | 20 +++++++---
 docs/tutorials/summary_stats/sum.rst          | 20 +++++++---
 nannyml/thresholds.py                         |  4 +-
 10 files changed, 155 insertions(+), 64 deletions(-)

diff --git a/docs/tutorials/data_quality/missing.rst b/docs/tutorials/data_quality/missing.rst
index 586e33cc..7cbf608a 100644
--- a/docs/tutorials/data_quality/missing.rst
+++ b/docs/tutorials/data_quality/missing.rst
@@ -38,14 +38,22 @@ The :class:`~nannyml.data_quality.missing.calculator.MissingValuesCalculator` cl
 the functionality needed for missing values calculations.
 We need to instantiate it with appropriate parameters:
 
-- The names of the columns to be evaluated.
-- Optionally, a boolean option indicating whether we want the absolute count of the missing
+- **column_names:** A list with the names of columns to be evaluated.
+- **normalize (Optional):** A boolean indicating whether we want the absolute count of the missing
   value instances or their relative ratio. By default it is set to true.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- Optionally, a threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **thresholds (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Missing Values.ipynb
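(Editor's note, not part of the patch: a minimal usage sketch of the parameters documented above. It assumes the calculator and the synthetic car loan dataset loader are available as top-level exports in a recent NannyML release; the column names are illustrative.)

.. code-block:: python

    import nannyml as nml

    # Reference and analysis sets from NannyML's bundled synthetic dataset.
    reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

    # Count missing values per chunk, reported as a ratio (normalize=True is the default).
    calc = nml.MissingValuesCalculator(
        column_names=['car_value', 'salary_range', 'driver_tenure'],  # illustrative columns
        normalize=True,
        timestamp_column_name='timestamp',
        chunk_size=5000,
    )
    calc.fit(reference_df)                  # fit on the reference period
    results = calc.calculate(analysis_df)   # evaluate the analysis period
    results.plot().show()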
diff --git a/docs/tutorials/data_quality/unseen.rst b/docs/tutorials/data_quality/unseen.rst
index de9fd104..a5127bf3 100644
--- a/docs/tutorials/data_quality/unseen.rst
+++ b/docs/tutorials/data_quality/unseen.rst
@@ -40,14 +40,22 @@ The :class:`~nannyml.data_quality.unseen.calculator.UnseenValuesCalculator` clas
 the functionality needed for unseen values calculations.
 We need to instantiate it with appropriate parameters:
 
-- The names of the columns to be evaluated. They need to be categorical columns.
-- Optionally, a boolean option indicating whether we want the absolute count of the unseen
+- **column_names:** A list with the names of columns to be evaluated. They need to be categorical columns.
+- **normalize (Optional):** A boolean indicating whether we want the absolute count of the unseen
   value instances or their relative ratio. By default it is set to true.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- Optionally, a threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **thresholds (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. warning::
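(Editor's note, not part of the patch: a sketch of the parameters documented above, assuming the calculator is exported at the package top level; the categorical column names are illustrative.)

.. code-block:: python

    import nannyml as nml

    reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

    # Flag category values in the analysis data that never appeared in the reference data.
    calc = nml.UnseenValuesCalculator(
        column_names=['salary_range', 'repaid_loan_on_prev_car'],  # categorical columns only
        normalize=True,
        timestamp_column_name='timestamp',
        chunk_number=10,
    )
    calc.fit(reference_df)
    results = calc.calculate(analysis_df)
    print(results.to_df().head())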
diff --git a/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst b/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst
index 39d28cff..19f40366 100644
--- a/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst
+++ b/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst
@@ -50,12 +50,30 @@ Let's start by loading some synthetic data provided by the NannyML package and s
     :cell: 2
 
 The :class:`~nannyml.drift.multivariate.data_reconstruction.calculator.DataReconstructionDriftCalculator`
-module implements this functionality. We need to instantiate it with appropriate parameters - the column names of the features we want to run drift detection on,
-and the timestamp column name. The features can be passed in as a simple list of strings. Alternatively, we can create a list by excluding the columns in the dataframe that are not features,
-and pass them into the argument.
-
-Next, the :meth:`~nannyml.base.AbstractCalculator.fit` method needs to be called on the reference data, which the results will be based on.
-Then the
+module implements this functionality. We need to instantiate it with appropriate parameters:
+
+- **column_names:** A list with the column names of the features we want to run drift detection on.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **n_components (Optional):** The n_components parameter as passed to the sklearn `PCA constructor`_.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **imputer_categorical (Optional):** An sklearn `SimpleImputer`_ object specifying an appropriate strategy
+  for imputing missing values for categorical features.
+- **imputer_continuous (Optional):** An sklearn `SimpleImputer`_ object specifying an appropriate strategy
+  for imputing missing values for continuous features.
+- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
+
+Next, the :meth:`~nannyml.base.AbstractCalculator.fit` method needs to be called on the reference data,
+which the results will be based on. Then the
 :meth:`~nannyml.base.AbstractCalculator.calculate` method will
 calculate the multivariate drift results on the provided data.
 
@@ -101,11 +119,8 @@ NannyML can also visualize the multivariate drift results in a plot. Our plot co
 
 * The purple step plot shows the reconstruction error in each chunk of the analysis
   period. Thick squared point markers indicate the middle of these chunks.
-
 * The low-saturated purple area around the reconstruction error indicates the :ref:`sampling error`.
-
 * The red horizontal dashed lines show upper and lower thresholds for alerting purposes.
-
 * If the reconstruction error crosses the upper or lower threshold an alert is raised which is indicated with a red,
   low-saturated background across the whole width of the relevant chunk. A red, diamond-shaped point marker additionally indicates
   this in the middle of the chunk.
 
@@ -118,9 +133,6 @@ NannyML can also visualize the multivariate drift results in a plot. Our plot co
 The multivariate drift results provide a concise summary of where data drift is happening in our
 input data.
 
-.. _SimpleImputer: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
-
-
 Insights
 --------
 
@@ -137,3 +149,6 @@ estimate the impact of the observed changes.
 
 For more information on how multivariate drift detection works, the
 :ref:`Data Reconstruction with PCA` explanation
 page gives more details.
+
+.. _`PCA constructor`: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
+.. _`SimpleImputer`: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
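(Editor's note, not part of the patch: a sketch showing how the parameters documented above, including the newly documented imputers, might be combined. Dataset and column names are illustrative.)

.. code-block:: python

    import nannyml as nml
    from sklearn.impute import SimpleImputer

    reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

    feature_column_names = [
        'car_value', 'salary_range', 'debt_to_income_ratio', 'loan_length',
        'repaid_loan_on_prev_car', 'size_of_downpayment', 'driver_tenure',
    ]

    calc = nml.DataReconstructionDriftCalculator(
        column_names=feature_column_names,
        timestamp_column_name='timestamp',
        chunk_size=5000,
        imputer_categorical=SimpleImputer(strategy='most_frequent'),
        imputer_continuous=SimpleImputer(strategy='mean'),
    )
    calc.fit(reference_df)                  # learn the PCA projection on reference data
    results = calc.calculate(analysis_df)
    results.plot().show()                   # reconstruction error per chunk, with thresholds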
diff --git a/docs/tutorials/detecting_data_drift/univariate_drift_detection.rst b/docs/tutorials/detecting_data_drift/univariate_drift_detection.rst
index c210433e..fd1e3a4b 100644
--- a/docs/tutorials/detecting_data_drift/univariate_drift_detection.rst
+++ b/docs/tutorials/detecting_data_drift/univariate_drift_detection.rst
@@ -49,14 +49,34 @@ We begin by loading some synthetic data provided in the NannyML package. This is
 The :class:`~nannyml.drift.univariate.calculator.UnivariateDriftCalculator` class implements the functionality
 needed for univariate drift detection. First, we need to instantiate it with the appropriate parameters:
 
-- The names of the columns to be evaluated.
-- A list of methods to use on continuous columns. You can chose from :ref:`kolmogorov_smirnov`,
-  :ref:`jensen_shannon`, :ref:`wasserstein`
-  and :ref:`hellinger`.
-- A list of methods to use on categorical columns.
-  You can choose from :ref:`chi2`, :ref:`jensen_shannon`,
+- **column_names:** A list with the names of columns to be evaluated.
+- **treat_as_categorical (Optional):** A list of column names to treat as categorical columns.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **categorical_methods (Optional):** A list of methods to use on categorical columns.
+  You can choose from :ref:`chi2`, :ref:`jensen_shannon`,
   :ref:`l_infinity`, and :ref:`hellinger`.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default chunker creating 10 chunks will be used.
+- **continuous_methods (Optional):** A list of methods to use on continuous columns.
+  You can choose from :ref:`kolmogorov_smirnov`,
+  :ref:`jensen_shannon`,
+  :ref:`wasserstein`
+  and :ref:`hellinger`.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **thresholds (Optional):** A dictionary allowing users to set a custom threshold strategy for each method.
+  It links a `Threshold` subclass to a method name.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
+- **computation_params (Optional):** A dictionary which allows users to specify whether they want drift calculated on
+  the exact reference data or an estimated distribution of the reference data obtained
+  using binning techniques. Applicable only to Kolmogorov-Smirnov and Wasserstein. For more information, see
+  :class:`~nannyml.drift.univariate.calculator.UnivariateDriftCalculator`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Drift - Univariate.ipynb
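(Editor's note, not part of the patch: a sketch pairing the documented parameters with method names taken from the lists above; dataset and column names are illustrative.)

.. code-block:: python

    import nannyml as nml

    reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

    calc = nml.UnivariateDriftCalculator(
        column_names=['car_value', 'salary_range', 'debt_to_income_ratio'],
        treat_as_categorical=['salary_range'],
        timestamp_column_name='timestamp',
        continuous_methods=['kolmogorov_smirnov', 'jensen_shannon'],
        categorical_methods=['chi2', 'jensen_shannon'],
        chunk_size=5000,
    )
    calc.fit(reference_df)
    results = calc.calculate(analysis_df)
    results.plot().show()   # one drift plot per column/method combination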
diff --git a/docs/tutorials/summary_stats/avg.rst b/docs/tutorials/summary_stats/avg.rst
index 5f4882fa..88874f14 100644
--- a/docs/tutorials/summary_stats/avg.rst
+++ b/docs/tutorials/summary_stats/avg.rst
@@ -36,12 +36,20 @@ The :class:`~nannyml.stats.avg.calculator.SummaryStatsAvgCalculator` class imple
 the functionality needed for mean values calculations.
 We need to instantiate it with appropriate parameters:
 
-- The names of the columns to be evaluated.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- Optionally, a threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **column_names:** A list with the names of columns to be evaluated.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Stats - Avg.ipynb
diff --git a/docs/tutorials/summary_stats/count.rst b/docs/tutorials/summary_stats/count.rst
index ae2194c2..b3e22fbd 100644
--- a/docs/tutorials/summary_stats/count.rst
+++ b/docs/tutorials/summary_stats/count.rst
@@ -34,11 +34,19 @@ The :class:`~nannyml.stats.count.calculator.SummaryStatsRowCountCalculator` clas
 the functionality needed for row count calculations.
 We need to instantiate it with appropriate *optional* parameters:
 
-- The name of the column containing the observation timestamps.
-- A chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- A threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Stats - Count.ipynb
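(Editor's note, not part of the patch: a sketch of the mean and row count calculators documented above, assuming they are exported at the package top level as in recent NannyML releases; column names are illustrative.)

.. code-block:: python

    import nannyml as nml

    reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

    # Track the mean of selected numeric columns per chunk.
    avg_calc = nml.SummaryStatsAvgCalculator(
        column_names=['car_value', 'debt_to_income_ratio'],
        timestamp_column_name='timestamp',
        chunk_size=5000,
    )
    avg_calc.fit(reference_df)
    avg_results = avg_calc.calculate(analysis_df)
    avg_results.plot().show()

    # Track the number of rows per chunk; every parameter is optional here.
    count_calc = nml.SummaryStatsRowCountCalculator(chunk_size=5000)
    count_calc.fit(reference_df)
    count_results = count_calc.calculate(analysis_df)
    count_results.plot().show()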
diff --git a/docs/tutorials/summary_stats/median.rst b/docs/tutorials/summary_stats/median.rst
index d13fd454..c9d96d48 100644
--- a/docs/tutorials/summary_stats/median.rst
+++ b/docs/tutorials/summary_stats/median.rst
@@ -36,12 +36,20 @@ The :class:`~nannyml.stats.avg.calculator.SummaryStatsMedianCalculator` class im
 the functionality needed for median values calculations.
 We need to instantiate it with appropriate parameters:
 
-- The names of the columns to be evaluated.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- Optionally, a threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **column_names:** A list with the names of columns to be evaluated.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Stats - Median.ipynb
diff --git a/docs/tutorials/summary_stats/std.rst b/docs/tutorials/summary_stats/std.rst
index 849a8e41..142bd8e1 100644
--- a/docs/tutorials/summary_stats/std.rst
+++ b/docs/tutorials/summary_stats/std.rst
@@ -36,12 +36,20 @@ The :class:`~nannyml.stats.std.calculator.SummaryStatsStdCalculator` class imple
 the functionality needed for standard deviation values calculations.
 We need to instantiate it with appropriate parameters:
 
-- The names of the columns to be evaluated.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- Optionally, a threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **column_names:** A list with the names of columns to be evaluated.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Stats - Std.ipynb
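(Editor's note, not part of the patch: the median and standard deviation calculators documented above follow the same pattern; the sketch below assumes top-level exports and illustrative column names.)

.. code-block:: python

    import nannyml as nml

    reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()
    columns = ['car_value', 'debt_to_income_ratio']

    # Same fit/calculate workflow for both summary statistics.
    for calculator_cls in (nml.SummaryStatsMedianCalculator, nml.SummaryStatsStdCalculator):
        calc = calculator_cls(
            column_names=columns,
            timestamp_column_name='timestamp',
            chunk_number=10,
        )
        calc.fit(reference_df)
        results = calc.calculate(analysis_df)
        print(results.to_df().head())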
diff --git a/docs/tutorials/summary_stats/sum.rst b/docs/tutorials/summary_stats/sum.rst
index da2c4179..634d9062 100644
--- a/docs/tutorials/summary_stats/sum.rst
+++ b/docs/tutorials/summary_stats/sum.rst
@@ -36,12 +36,20 @@ The :class:`~nannyml.stats.sum.calculator.SummaryStatsSumCalculator` class imple
 the functionality needed for sum values calculations.
 We need to instantiate it with appropriate parameters:
 
-- The names of the columns to be evaluated.
-- Optionally, the name of the column containing the observation timestamps.
-- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default
-  chunker creating 10 chunks will be used.
-- Optionally, a threshold strategy to modify the default one. See available threshold options
-  :ref:`here`.
+- **column_names:** A list with the names of columns to be evaluated.
+- **timestamp_column_name (Optional):** The name of the column in the reference data that
+  contains timestamps.
+- **chunk_size (Optional):** The number of observations in each chunk of data
+  used. Only one chunking argument needs to be provided. For more information about
+  :term:`chunking` configurations check out the :ref:`chunking tutorial`.
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each
+  :ref:`period`.
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in
+  order to create chunks.
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation
+  of the provided data in order to create chunks.
+- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits.
+  For more information about thresholds, check out the :ref:`thresholds tutorial`.
 
 .. nbimport::
     :path: ./example_notebooks/Tutorial - Stats - Sum.ipynb
diff --git a/nannyml/thresholds.py b/nannyml/thresholds.py
index 667b7de6..b39e7bff 100644
--- a/nannyml/thresholds.py
+++ b/nannyml/thresholds.py
@@ -264,7 +264,7 @@ def calculate_threshold_values(
     if (
         lower_threshold_value_limit is not None
         and lower_threshold_value is not None
-        and lower_threshold_value < lower_threshold_value_limit
+        and lower_threshold_value <= lower_threshold_value_limit
     ):
         override_value = None if override_using_none else lower_threshold_value_limit
         if logger:
@@ -277,7 +277,7 @@ def calculate_threshold_values(
     if (
         upper_threshold_value_limit is not None
         and upper_threshold_value is not None
-        and upper_threshold_value > upper_threshold_value_limit
+        and upper_threshold_value >= upper_threshold_value_limit
     ):
         override_value = None if override_using_none else upper_threshold_value_limit
         if logger:
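(Editor's note, not part of the patch: a simplified, self-contained sketch of the edge case the thresholds.py change addresses — a computed threshold that lands exactly on its value limit is now also overridden. The helper below only mirrors the patched comparisons; it is not the actual nannyml.thresholds.calculate_threshold_values function.)

.. code-block:: python

    from typing import Optional, Tuple

    def clamp_thresholds(
        lower: Optional[float],
        upper: Optional[float],
        lower_limit: Optional[float] = None,
        upper_limit: Optional[float] = None,
        override_using_none: bool = False,
    ) -> Tuple[Optional[float], Optional[float]]:
        """Clamp computed thresholds to their limits, including the equality edge case."""
        if lower_limit is not None and lower is not None and lower <= lower_limit:
            lower = None if override_using_none else lower_limit
        if upper_limit is not None and upper is not None and upper >= upper_limit:
            upper = None if override_using_none else upper_limit
        return lower, upper

    # Before the patch, a computed lower threshold of exactly 0.0 was not overridden
    # when the limit was also 0.0; with `<=` the equality case is handled as well.
    print(clamp_thresholds(lower=0.0, upper=0.08, lower_limit=0.0, upper_limit=1.0))
    # -> (0.0, 0.08)
    print(clamp_thresholds(lower=0.0, upper=0.08, lower_limit=0.0, override_using_none=True))
    # -> (None, 0.08)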