diff --git a/requirements.txt b/requirements.txt index 54228512..17545d54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ pandas==1.2.* pydantic==1.8.2 scikit-learn==0.24.2 +matplotlib==3.4.2 diff --git a/src/ydata_quality/core/engine.py b/src/ydata_quality/core/engine.py index adefa214..a3337410 100644 --- a/src/ydata_quality/core/engine.py +++ b/src/ydata_quality/core/engine.py @@ -7,15 +7,17 @@ import pandas as pd from ydata_quality.core import QualityWarning from ydata_quality.core.warnings import Priority - +from ydata_quality.utils.modelling import infer_dtypes class QualityEngine(ABC): "Main class for running and storing data quality analysis." - def __init__(self, df: pd.DataFrame): + def __init__(self, df: pd.DataFrame, label: str = None, dtypes: dict = None): self._df = df self._warnings = set() self._tests = [] + self._label = label + self._dtypes = dtypes @property def df(self): @@ -27,6 +29,44 @@ def warnings(self): "Storage of all detected data quality warnings." return self._warnings + + @property + def label(self): + "Property that returns the label under inspection." + return self._label + + @label.setter + def label(self, label: str): + if not isinstance(label, str): + raise ValueError("Property 'label' should be a string.") + assert label in self.df.columns, "Given label should exist as a DataFrame column." + self._label = label + + @property + def dtypes(self): + "Infered dtypes for the dataset." + if self._dtypes is None: + self._dtypes = infer_dtypes(self.df) + return self._dtypes + + @dtypes.setter + def dtypes(self, dtypes: dict): + if not isinstance(dtypes, dict): + raise ValueError("Property 'dtypes' should be a dictionary.") + assert all(col in self.df.columns for col in dtypes), "All dtypes keys \ + must be columns in the dataset." 
+ supported_dtypes = ['numerical', 'categorical'] + assert all(dtype in supported_dtypes for dtype in dtypes.values()), "Assigned dtypes\ + must be in the supported broad dtype list: {}.".format(supported_dtypes) + df_col_set = set(self.df.columns) + dtypes_col_set = set(dtypes.keys()) + missing_cols = df_col_set.difference(dtypes_col_set) + if missing_cols: + _dtypes = infer_dtypes(self.df, skip=df_col_set.difference(missing_cols)) + for col, dtype in _dtypes.items(): + dtypes[col] = dtype + self._dtypes = dtypes + def store_warning(self, warning: QualityWarning): "Adds a new warning to the internal 'warnings' storage." self._warnings.add(warning) diff --git a/src/ydata_quality/drift/__init__.py b/src/ydata_quality/drift/__init__.py new file mode 100644 index 00000000..7ea6b047 --- /dev/null +++ b/src/ydata_quality/drift/__init__.py @@ -0,0 +1,9 @@ +""" +Tools to check dataset for data drifting. +""" +from ydata_quality.drift.engine import DriftAnalyser, ModelWrapper + +__all__ = [ + "DriftAnalyser", + "ModelWrapper" +] diff --git a/src/ydata_quality/drift/engine.py b/src/ydata_quality/drift/engine.py new file mode 100644 index 00000000..ddac29de --- /dev/null +++ b/src/ydata_quality/drift/engine.py @@ -0,0 +1,356 @@ +""" +Implementation of DriftAnalyser engine to run data drift analysis. +""" +from typing import Callable, Optional, Union + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from scipy.stats import ks_2samp +from scipy.stats._continuous_distns import chi2_gen +from ydata_quality.core import QualityEngine, QualityWarning +from ydata_quality.utils.modelling import infer_dtypes + + +class ModelWrapper: + """Base class for model wrapper. + Defines a Model instance to enable concept drift analysis with the Sampling engine. + This class is meant to cover all functionality needed to interact with the engine. + Can be instantiated directly or inherited from a custom class. 
+ In normal use only the preprocess and postprocess methods should need override.""" + + def __init__(self, model: Callable): + self._model = model + + @property + def model(self): + """Passes the provided callable as the property model.""" + return self._model + + @staticmethod + def _preprocess(x: pd.DataFrame): + """Performs any preprocessing of the model input. + By default returns input without any transformation. + Override to define custom preprocessing steps.""" + return x + + @staticmethod + def _postprocess(y: pd.Series): + """Performs any postprocessing of the models label predictions. + By default returns input without any transformation. + Override to define custom model predictions postprocessing steps.""" + return y + + def _predict(self, x: pd.DataFrame): + """Runs the provided callable model on pretransformed input.""" + if hasattr(self.model, "predict"): # Sklearn and tensorflow model standards + return self.model.predict(x) + else: # Pytorch and other __call__ prediction standards + return self.model(x) + + def __call__(self, x: pd.DataFrame) -> pd.Series: + """Returns a sample of labels predicted by the model from the covariate sample x. + The returned Series is expected to have the same number of rows as x.""" + transformed_x = self._preprocess(x) + raw_y = self._predict(transformed_x) + return self._postprocess(raw_y) + + +class DriftAnalyser(QualityEngine): + """Main class to run drift test analysis. + + Methods: + ref_covariate_drift: controls covariate drift in reference subsamples. + ref_label_drift: controls label drift in the reference subsamples. + sample_covariate_drift: detects covariate drift in the test sample, measured against the full reference sample. + sample_label_drift: detects label drift in the test sample, measured against the full reference sample. + sample_concept_drift: detects concept drift in the test sample based on a wrapped model provided by the user. 
+ """ + + def __init__(self, ref: pd.DataFrame, sample: Optional[pd.DataFrame] = None, + label: Optional[str] = None, model: Callable = None, holdout_size: float = 0.2): + """ + Initializes the engine properties and lists tests for automated evaluation. + Args: + ref (pd.DataFrame): reference sample used to run sampling analysis, ideally the users dataset or a train dataset. + sample (Optional, pd.DataFrame): sample to test drift against the reference sample, can be new data, a slice of the train dataset or a test sample. + label (Optional, str): defines a feature in the provided samples as label. + model (Optional, ModelWrapper): a custom model wrapped by the ModelWrapper class. The model is expected to perform label prediction over the set of features (covariates) of the provided samples. + holdout_size (float): Fraction to be kept as holdout for drift test. + """ + super().__init__(df=ref, label=label) + self.sample = sample + self._model = model + self.has_model = None + self._holdout, self._leftover = self._random_split(ref, holdout_size) + self._tests = ['ref_covariate_drift', 'ref_label_drift', 'sample_covariate_drift', + 'sample_label_drift', 'sample_concept_drift'] + + @property + def sample(self): + "Returns the user provided test sample." + return self._sample + + @sample.setter + def sample(self, sample: pd.DataFrame): + if sample is not None: + assert sorted(list(sample.columns)) == sorted(list(self.df.columns)), "The reference and independent samples must share schema." + self._sample = sample + + @property + def has_model(self): + return self._has_model + + @has_model.setter + def has_model(self, _): + try: + self._has_model = self.__test_model() + except AssertionError: + print("The provided model failed to produce output in the expected format during test and will not be used by the engine.") + self._has_model = False + except: + self._has_model = False + + def __test_model(self): + """Tests the provided model wrapper. 
+ Creates an example input from the provided samples. + A valid test output is a label series with the same number of rows as x. + Raises AssertionError if the model test fails. + Raises a general exception if the conditions for test were not met. + Please remove eventual label column from test_x before passing it to the test.""" + if self.label and self._model is not None: + test_x = self.df.head().copy() + test_x.drop(self.label, axis=1, inplace=True) + output = self._model(test_x) + assert isinstance(output, (pd.Series, np.ndarray)), "The provided model failed to produce the expected output." + assert len(output) == test_x.shape[0], "The provided model failed to produce output with the expected dimensionality." + return True + raise Exception + + @staticmethod + def _random_split(sample: Union[pd.DataFrame, pd.Series], split_size: float, shuffle=True): + """Shuffles sample and splits it into 2 partitions according to split_size. + Returns a tuple with the split first (partition corresponding to split_size, and remaining second). + Args: + sample (pd.DataFrame): A sample to be split + split_size (float): Fraction of the sample to be taken split + shuffle (bool): If True shuffles sample rows before splitting""" + assert 0<= split_size <=1, 'split_size must be a fraction, i.e. a float in the [0,1] interval.' + if shuffle: + sample = sample.sample(frac=1) # Shuffle dataset rows + split_len = int(sample.shape[0]*split_size) + split = sample.iloc[:split_len] + remainder = sample.iloc[split_len:] + return split, remainder + + @staticmethod + def _chisq_2samp(reference_data: pd.Series, test_data: pd.Series): + """Asserts validity of performing chisquared test on two samples. + Tests the hypothesis that the test_sample follows ref_sample's distribution. + Will raise an AssertionError in case the test is not valid. 
+ Args: + reference_data (pd.Series): Reference data, used to compute degrees of freedom and expectation + test_data (pd.Series): Test data, compared to the reference data + Returns: + chi_stat (float): The chi squared statistic of this test + p_val (float): The p-value of the tested hypothesis + """ + ref_unique_freqs = reference_data.value_counts(normalize=True) + test_unique_counts = test_data.value_counts() + assert set(test_unique_counts.index).issubset(set(ref_unique_freqs.index)),"test_sample contains categories unknown to the ref_sample" + test_expected_counts = ref_unique_freqs*len(test_data) + assert sum(test_expected_counts<5)==0, "The test sample has categories with expected count below 5 (this sample is too small for chi-squared test)" + chi_stat = sum(((test_unique_counts-test_expected_counts)**2)/test_expected_counts) + p_val = 1-chi2_gen().cdf(x=chi_stat, df=len(ref_unique_freqs-1)) + return chi_stat, p_val + + def _2sample_feat_goof(self, ref_sample: pd.Series, test_sample: pd.Series): + """Performs a goodness of fit test between 2 samples. + The column dtype of the samples allows for an appropriate statistic test selection. + Returns tuple (statistic_value, p_value, test_name). + If the statistic test raises an exception, (-1, None, test_name) is returned instead. + Args: + ref_sample (pd.Series): Reference sample (Relevant distinction for chi-squared test) + test_sample (pd.Series): Test sample""" + statistics = {'categorical': ('Chi-Squared', self._chisq_2samp), + 'numerical': ('Kolmogorov-Smirnov', ks_2samp)} + feat_dtype = self.dtypes[ref_sample.name] + test_name, test = statistics[feat_dtype] + try: + statistic_value, p_value = test(ref_sample, test_sample) + except: + statistic_value, p_value = -1, None + return statistic_value, p_value, test_name + + def ref_covariate_drift(self, p_thresh: float= 0.05): + """Controls covariate drift in reference subsamples. + The controlled metric is the number of features with no drift detection. 
+ This % is plotted against the size of the reference subsample. + A monotonic increase of the value is expected as the subsample size is increased. + The dtypes are used to decide the test to be applied per column (chi squared or KS). + The p-value threshold is adjusted for the multivariate case via Bonferroni correction. + Args: + p_thresh (float): The p_threshold used for the test. + """ + covariates = self._leftover.copy() + holdout = self._holdout.copy() + if self.label: + covariates.drop(self.label, axis=1, inplace=True) + holdout.drop(self.label, axis=1, inplace=True) + leftover_fractions = np.arange(0.2, 1.2, 0.2) + perc_index = ["{0:.0%}".format(fraction) for fraction in leftover_fractions] + control_metric = pd.Series(index=perc_index) + bonferroni_p = p_thresh/len(covariates.columns) # Bonferroni correction + all_p_vals = pd.DataFrame(index=perc_index, columns=covariates.columns) + for i, fraction in enumerate(leftover_fractions): + downsample, _ = self._random_split(covariates, fraction) + p_vals = [] + for column in covariates.columns: + _, p_val, _ = self._2sample_feat_goof(ref_sample = downsample[column], + test_sample = holdout[column]) + p_vals.append(p_val) + all_p_vals.iloc[i] = p_vals + control_metric.iloc[i] = 100*len([p for p in p_vals if p > bonferroni_p])/len(p_vals) + all_p_vals['Corrected p-value threshold'] = bonferroni_p + control_metric.plot(title='Reference sample covariate features no drift(%)', + xlabel='Percentage of remaining sample used', + ylabel='Percentage of no drift features', + ylim = (0, 104), style='.-') + plt.show() + return all_p_vals + + def ref_label_drift(self, p_thresh: float= 0.05): + """Controls label drift in the reference sample (df). + The p-value of the test is plotted against the size of the reference subsample. + A monotonic increase of this metric is expected as we increase the subsample size. + The dtype is used to decide the test to be applied to the label (chi squared or KS). 
+ Args: + p_thresh (float): The p_threshold used for the test.""" + if self.label is None: + return "[REFERENCE LABEL DRIFT] No label was provided. Test skipped." + labels = self._leftover[self.label].copy() + holdout = self._holdout[self.label] + leftover_fractions = np.arange(0.2, 1.2, 0.2) + p_values = pd.DataFrame(index=["{0:.0%}".format(fraction) for fraction in leftover_fractions], + columns=['Label p-value', 'p-value threshold']) + for i, fraction in enumerate(leftover_fractions): + downsample, _ = self._random_split(labels, fraction) + _, p_val, test_name = self._2sample_feat_goof(ref_sample = downsample, + test_sample = holdout) + p_values['Label p-value'].iloc[i] = p_val + p_values['p-value threshold'] = p_thresh + p_values.plot(title='Reference sample label p-values', + xlabel='Percentage of remaining sample used', + ylabel=f'{test_name} test p-value', style='.-') + plt.show() + + def sample_covariate_drift(self, p_thresh: float= 0.05): + """Detects covariate drift in the test sample (measured against the full reference sample). + The p-value threshold is adjusted for the multivariate case via Bonferroni correction. + Any p-value below the adjusted threshold indicates test sample drift, raising a warning. + The dtypes are used to decide the test to be applied per column (chi squared or KS). + Args: + p_thresh (float): The p_threshold used for the test. + """ + if self.sample is None: + return "[SAMPLE LABEL DRIFT] To run sample covariate drift, a test sample must be provided. Test skipped." 
+ covariates = self.df.copy() + test_sample = self.sample.copy() + if self.label: + covariates.drop(self.label, axis=1, inplace=True) + test_sample.drop(self.label, axis=1, inplace=True) + bonferroni_p = p_thresh/len(covariates.columns) # Bonferroni correction + test_summary = pd.DataFrame(index=covariates.columns, + columns=['Statistic', 'Statistic Value', 'p-value', 'Verdict']) + for column in covariates.columns: + stat_val, p_val, test_name = self._2sample_feat_goof(ref_sample = covariates[column], + test_sample = test_sample[column]) + test_summary.loc[column] = [test_name, stat_val, p_val, None] + test_summary['Verdict'] = test_summary['p-value'].apply( + lambda x: 'OK' if x > bonferroni_p else ('Drift' if x>= 0 else 'Invalid test')) + n_drifted_feats = sum(test_summary['Verdict']=='Drift') + n_invalid_tests = sum(test_summary['Verdict']=='Invalid test') + if n_drifted_feats>0: + self._warnings.add( + QualityWarning( + test='Sample covariate drift', category='Sampling', priority=2, data=test_summary, + description=f"""{n_drifted_feats} features accused drift in the sample test. The covariates of the test sample do not appear to be representative of the reference sample.""" + )) + elif n_invalid_tests>0: + self._warnings.add( + QualityWarning( + test='Sample covariate drift', category='Sampling', priority=3, data=test_summary, + description=f"""There were {n_invalid_tests} invalid tests found. This is likely due to a small test sample size. The data summary should be analyzed before considering the test conclusive.""" + )) + else: + print("[SAMPLE COVARIATE DRIFT] Covariate drift was not detected in the test sample.") + return test_summary + + def sample_label_drift(self, p_thresh: float= 0.05): + """Detects label drift in the test sample (measured against the full reference sample). + A p-value below the adjusted threshold indicates test sample drift, raising a warning. + The label dtype is used to decide the test to be applied (chi squared or KS). 
+ Args: + p_thresh (float): The p_threshold used for the test. + """ + if self.sample is None or self.label is None or self.label not in self.sample.columns: + return "[SAMPLE LABEL DRIFT] To run sample label drift, a test sample must be provided with the defined label column. Test skipped." + labels = self.df[self.label].copy() + test_sample = self.sample[self.label].copy() + stat_val, p_val, test_name = self._2sample_feat_goof(ref_sample = labels, + test_sample = test_sample) + test_summary = pd.Series(data=[test_name, stat_val, p_val, None], + index=['Statistic', 'Statistic Value', 'p-value', 'Verdict']) + test_summary['Verdict'] = 'OK' if p_val > p_thresh else ('Drift' if p_val>= 0 else 'Invalid test') + if test_summary['Verdict']=='Drift': + self._warnings.add( + QualityWarning( + test='Sample label drift', category='Sampling', priority=2, data=test_summary, + description=f"""The label accused drift in the sample test with a p-test of {p_val}, which is under the threshold {p_thresh}. The label of the test sample does not appear to be representative of the reference sample.""" + )) + elif test_summary['Verdict']=='Invalid test': + self._warnings.add( + QualityWarning( + test='Sample label drift', category='Sampling', priority=3, data=test_summary, + description=f"""The test was invalid. This is likely due to a small test sample size.""" + )) + else: + print("[SAMPLE LABEL DRIFT] Label drift was not detected in the test sample.") + return test_summary + + def sample_concept_drift(self, p_thresh: float= 0.05): + """Detects concept drift in the test sample resorting to a user provided model wrapper. + Results may not be conclusive without first testing if the test sample has label or covariate drift. + A p-value below the adjusted threshold indicates test sample concept drift, raising a warning. + The label dtype is used to decide the test to be applied (chi squared or KS). + Args: + p_thresh (float): The p_threshold used for the test. 
+ """ + if not self.has_model or self.sample is None: + return "[CONCEPT DRIFT] To run concept drift, a valid model, a test sample and label column must be provided. Test skipped." + ref_sample = self.df.copy() + test_sample = self.sample.copy() + ref_sample.drop(self.label, axis=1, inplace=True) + test_sample.drop(self.label, axis=1, inplace=True) + ref_preds = pd.Series(self._model(ref_sample), name=self.label) + test_preds = pd.Series(self._model(test_sample), name=self.label) + stat_val, p_val, test_name = self._2sample_feat_goof(ref_sample = ref_preds, + test_sample = test_preds) + test_summary = pd.Series(data=[test_name, stat_val, p_val, None], + index=['Statistic', 'Statistic Value', 'p-value', 'Verdict']) + test_summary['Verdict'] = 'OK' if p_val > p_thresh else ('Drift' if p_val>= 0 else 'Invalid test') + if test_summary['Verdict']=='Drift': + self._warnings.add( + QualityWarning( + test='Concept drift', category='Sampling', priority=2, data=test_summary, + description=f"""There was concept drift detected with a p-test of {p_val}, which is under the threshold {p_thresh}. The model's predicted labels for the test sample do not appear to be representative of the distribution of labels predicted for the reference sample.""" + )) + elif test_summary['Verdict']=='Invalid test': + self._warnings.add( + QualityWarning( + test='Concept drift', category='Sampling', priority=3, data=test_summary, + description=f"""The test was invalid. 
This is likely due to a small test sample size.""" + )) + else: + print("[CONCEPT DRIFT] Concept drift was not detected between the reference and the test samples.") + return test_summary diff --git a/src/ydata_quality/labelling/engine.py b/src/ydata_quality/labelling/engine.py index cc6487d9..a1a44507 100644 --- a/src/ydata_quality/labelling/engine.py +++ b/src/ydata_quality/labelling/engine.py @@ -26,46 +26,9 @@ class SharedLabelInspector(QualityEngine): """Shared structure for Numerical/Categorical Label Inspector""" def __init__(self, df: pd.DataFrame, label: str): - super().__init__(df) # Runs init from the Quality Engine - self._label = label - self._dtypes = infer_dtypes(self.df) + super().__init__(df=df, label=label) self._tdf = None - @property - def label(self): - "Property that returns the label under inspection." - return self._label - - @label.setter - def label(self, label: str): - if not isinstance(label, str): - raise ValueError("Property 'label' should be a string.") - assert label in self.df.columns, "Given label should exist as a DataFrame column." - self._label = label - - @property - def dtypes(self): - "Property that returns infered dtypes for the dataset." - return self._dtypes - - @dtypes.setter - def dtypes(self, dtypes: dict): - if not isinstance(dtypes, dict): - raise ValueError("Property 'dtypes' should be a dictionary.") - assert all(col in self.df.columns for col in dtypes), "All dtypes keys \ - must be columns in the dataset." 
- supported_dtypes = ['numerical', 'categorical'] - assert all(dtype in supported_dtypes for dtype in dtypes.values()), "Assigned dtypes\ - must be in the supported broad dtype list: {}.".format(supported_dtypes) - df_col_set = set(self.df.columns) - dtypes_col_set = set(dtypes.keys()) - missing_cols = df_col_set.difference(dtypes_col_set) - if missing_cols: - _dtypes = infer_dtypes(self.df, skip=df_col_set.difference(missing_cols)) - for col, dtype in _dtypes.items(): - dtypes[col] = dtype - self._dtypes = dtypes - @property def tdf(self): "Property that returns the transformed dataset centroids for all (not nan) classes." @@ -113,7 +76,7 @@ class CategoricalLabelInspector(SharedLabelInspector): Ordinal labels can be handled if passed as categorical.""" def __init__(self, df: pd.DataFrame, label: str): - super().__init__(df, label) + super().__init__(df=df, label=label) self._centroids = None self._tests = ["missing_labels", "few_labels", "unbalanced_classes", "one_vs_rest_performance", "outlier_detection"] diff --git a/tutorials/drift.ipynb b/tutorials/drift.ipynb new file mode 100644 index 00000000..3a8b3e6b --- /dev/null +++ b/tutorials/drift.ipynb @@ -0,0 +1,531 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# YData Quality - Data Drift Tutorial\n", + "Time-to-Value: 10 minutes\n", + "\n", + "This notebook provides a tutorial for the ydata_quality package funcionality for drift analysis.\n", + "\n", + "**Structure:**\n", + "\n", + "0. A data drifting introduction\n", + "1. Load dataset\n", + "2. Train and wrap example model\n", + "3. Distort dataset\n", + "4. Instantiate the Data Quality engine\n", + "5. Run the quality checks\n", + "6. Assess the warnings\n", + "7. 
(Extra) Detailed overview" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A data drifting introduction\n", + "### What is data drift?\n", + "Data drift is a broad term used for differences in the data observed by a model during training and prediction time.\n", + "\n", + "These divergences are part of most real world settings and can hinder the task of learning a mapping from input (features) space to output space (labels).\n", + "Sometimes these divergences are intended, such as in __[Domain Adaptation](https://en.wikipedia.org/wiki/Domain_adaptation)__ applications where the success is defined by the model's ability to learn from a rich, source dataset and generalize this capacity to a target dataset where labelled data can be scarce or unavailable.\n", + "In other cases, the differences are unintended and reflect the dynamic nature of the underlying use case (e.g. stock prices) or are just a reflection of the difficulty of creating a representative dataset of the target population.\n", + "\n", + "For the three scenarios presented above, drift is an important concept and a pivotal aspect of Data Quality.\n", + "***\n", + "### Types of data drift\n", + "We identify and diagnose the three main types of data drift:\n", + "* **Covariate, input drift (X)**\n", + " * Drift detected in the independent features between a reference sample (e.g. train data) and a target sample (e.g. test data). In an ideal scenario we expect $X_{ref}$ = $X_{target}$, which means that, statistically, input data from the target appears to be generated from the same distribution as the reference data.\n", + "* **Label drift (Y)**\n", + " * Drift detected in the dependent feature between a reference sample and a target sample.
In an ideal scenario we expect $Y_{ref}$ = $Y_{target}$, which means that, statistically, labels from the target sample appear to be generated from the same distribution as the reference labels.\n", + "* **Concept drift (Y|X)**\n", + " * Drift detected in the input/output mappings defined by a model between a reference and a target sample. This gains more relevance if we can first establish that there is no covariate or label drift. In an ideal scenario we expect $Y_{ref}|X_{ref}$ = $Y_{target}|X_{target}$, which means that, statistically, the model appears to follow the same mapping process when predicting labels for the target sample as for the reference sample.\n", + "***\n", + "### How do we detect data drift?\n", + "There are many strategies to detect data drift. Some strategies rely on fitting a classifier to distinguish which observations come from a reference sample and define the existence of drift based on the performance of this classifier (poor performance suggests low data drift). Other strategies rely on empirical distance metrics like the two-sample __[Maximum Mean Discrepancy](https://jmlr.csail.mit.edu/papers/volume13/gretton12a/gretton12a.pdf)__ or on classical statistical tests and hypothesis testing.\n", + "\n", + "To depend on the fewest assumptions, we choose the latter alternative. For all numerical features we apply a __[Kolmogorov-Smirnov test](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)__ and for categorical data we use the __[Chi-Squared test](https://en.wikipedia.org/wiki/Chi-squared_test)__. The standard p-value thresholds are used to determine the outcome of the performed tests. For concept drift we abstain from trying to fit a model and evaluating drift with that model, because that is no guarantee of similar behaviour by a user's model.
For this reason we accommodate user-provided models and analyse the model behaviour for the intended task which should provide the most useful insights.\n", + "\n", + "Since input data (X) is often multi-dimensional, we agglomerate the results of multiple statistical tests by applying a __[Bonferroni correction](https://en.wikipedia.org/wiki/Bonferroni_correction)__ to the p-value threshold. This allows for a global assessment of drift from many individual statistical tests. So, when we say that there is covariate drift, we mean that at least one of the features yielded a p-value below our Bonferroni-corrected threshold.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Drift engine demo and tutorial\n", + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import datasets\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from ydata_quality.drift import DriftAnalyser, ModelWrapper\n", + "import random" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the example dataset\n", + "We will use the wine dataset available from the sklearn package." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = datasets.load_wine(as_frame=True)['frame']\n", + "df['target'] = df['target'].apply(str) # Forcing categorical type inferral on target feature" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train and wrap example model\n", + "We will train a simple classifier on the full dataset and wrap it with the ModelWrapper class just to demonstrate the engine's concept drift test."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# First we train a classifier\n", + "x = df.loc[:, df.columns != 'target']\n", + "y = df['target']\n", + "clf = DecisionTreeClassifier(random_state=0)\n", + "clf.fit(x, y)\n", + "\n", + "# Now we wrap it by directly instantiating the ModelWrapper class.\n", + "# The wrapper will handle using __call__ or predict method depending on the passed model\n", + "wrapped_model = ModelWrapper(clf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Distort the original dataset\n", + "Apply transformations to highlight the data quality functionalities." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def create_sample(df, frac=0.2):\n", + " # Forge a sample\n", + " sample = df.sample(frac=frac)\n", + " # Distort sample values\n", + " sample['alcohol'] = sample['alcohol'] + 0.8\n", + " sample['target'] = random.choices(['0', '1', '2'], weights=[0.7, 0.2, 0.1], k=len(sample))\n", + " return sample\n", + "\n", + "sample = create_sample(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create the engine\n", + "Each engine contains the checks and tests for each suite. To create a DriftAnalyser, you provide:\n", + "- ref: reference DataFrame, DataFrame that we will assume as the reference for the modelled population\n", + "- sample (optional): A test sample which we will compare against the reference. It should have the same schema as the reference dataframe, although the label column can always be optional for this sample (even when provided for the reference)\n", + "- label (optional): A string defining the label feature (will be searched for both in the reference and test samples)\n", + "- model (optional): A callable that inherits or directly instantiates ModelWrapper. 
This is supposed to wrap a label prediction model that consumes the provided sample covariates as input\n", + "- holdout_size (optional): A fraction defining the percentage of rows from the reference sample that are held-out for the reference tests. A 20% random subsample is taken by default." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "da = DriftAnalyser(ref=df, sample=sample, label='target', model=wrapped_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Full Evaluation\n", + "The easiest way to assess the data quality analysis is to run `.evaluate()` which returns a list of warnings for each quality check. " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CONCEPT DRIFT] Concept drift was not detected between the reference and the test samples.\n" + ] + } + ], + "source": [ + "results = da.evaluate()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check the status\n", + "After running the data quality checks, you can check the warnings for each individual test. The warnings are suited by priority and have additional details that can provide better insights for Data Scientists." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[SAMPLE LABEL DRIFT] The label accused drift in the sample test with a p-test of 0.011552904433326017, which is under the threshold 0.05. The label of the test sample does not appear to be representative of the reference sample. (Priority 2: usage allowed, limited human intelligibility)\n", + "[SAMPLE COVARIATE DRIFT] 1 features accused drift in the sample test. The covariates of the test sample do not appear to be representative of the reference sample. (Priority 2: usage allowed, limited human intelligibility)\n" + ] + } + ], + "source": [ + "da.report()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Full Test Suite\n", + "In this section, you will find a detailed overview of the available tests in the Data Drift module of ydata_quality." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reference sample covariate drift\n", + "\n", + "In this test we look for evidence of the reference sample covariates being representative of the underlying population.\n", + "A holdout is taken (20% by default), and random slices of increasing size are taken from the leftover 80% of the data.\n", + "The leftover slices are tested against the holdout in an attempt to provide drift evidence.\n", + "\n", + "Due to the complexity of this strategy, we provide the tooling for Data Scientists to infer the healthiness of the reference sample and avoid drawing conclusions automatically based on heuristics. A healthy indicator of data quality would be a monotonic increase of the percentage of features with no drift evidence and increasing individual p-values for the least performant tests, as the leftover slices are increased." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "ref_cov_drift_out = da.ref_covariate_drift()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reference sample label drift\n", + "\n", + "In this test we look for evidence of the reference sample labels being representative of the underlying population.\n", + "The same holdout and undersampling strategy from the previous test are used and, since now we have a univariate test, only the p-values for the increasing leftover slices are shown." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Here we notice the effects of changing all labels in the test sample to a fixed class\n", + "ref_label_drift_out = da.ref_label_drift()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test sample covariate drift\n", + "\n", + "In this test we look for evidence of the test sample covariates drifting from the reference sample.\n", + "The full reference sample is used in modelling the distribution against which we evaluate the test sample.\n", + "\n", + "In the multivariate case, Bonferroni correction for the p-value is used in order to mitigate potential false positives in the drift diagnosis.\n", + "If any feature from the covariates accuses drift, we raise a warning for the full sample.\n", + "In the case that no drift was detected, but there were failing tests due to other data issues (e.g. different support on categoricals), we will draw your attention to it with a lower-priority warning and recommendations (usually this is due to too small test samples)." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StatisticStatistic Valuep-valueVerdict
alcoholKolmogorov-Smirnov0.4875160.0Drift
malic_acidKolmogorov-Smirnov0.1345190.59822OK
ashKolmogorov-Smirnov0.1398250.549066OK
alcalinity_of_ashKolmogorov-Smirnov0.1254680.680859OK
magnesiumKolmogorov-Smirnov0.0873910.957606OK
\n", + "
" + ], + "text/plain": [ + " Statistic Statistic Value p-value Verdict\n", + "alcohol Kolmogorov-Smirnov 0.487516 0.0 Drift\n", + "malic_acid Kolmogorov-Smirnov 0.134519 0.59822 OK\n", + "ash Kolmogorov-Smirnov 0.139825 0.549066 OK\n", + "alcalinity_of_ash Kolmogorov-Smirnov 0.125468 0.680859 OK\n", + "magnesium Kolmogorov-Smirnov 0.087391 0.957606 OK" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# As expected the corrupted alcohol feature is detected after the corruption step, a small boost of 0.8 vol(%) triggered this alarm\n", + "sample_cov_drift_out = da.sample_covariate_drift()\n", + "sample_cov_drift_out.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test sample label drift\n", + "\n", + "In this test we look for evidence of the test sample label drifting from the reference sample.\n", + "The same strategy from the previous step applies.\n", + "If the label accuses drift, we raise a warning for the test sample.\n", + "In the case that no drift was detected, but there was a failing test due to other data issues (e.g. different support on categoricals), we will draw your attention to it with a lower-priority warning and a recommendation (usually this is due to too small test samples)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Statistic Chi-Squared\n", + "Statistic Value 11.032223\n", + "p-value 0.011553\n", + "Verdict Drift\n", + "dtype: object" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_label_drift_out = da.sample_label_drift()\n", + "sample_label_drift_out" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test concept drift\n", + "\n", + "In this test we look for evidence of concept drift.\n", + "If the provided model produces output for the test sample which appears not to be representative of the output generated from the reference sample, concept drift is detected." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CONCEPT DRIFT] Concept drift was not detected between the reference and the test samples.\n" + ] + }, + { + "data": { + "text/plain": [ + "Statistic Chi-Squared\n", + "Statistic Value 0.538533\n", + "p-value 0.910348\n", + "Verdict OK\n", + "dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_concept_drift_out = da.sample_concept_drift()\n", + "sample_concept_drift_out" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "cdc2bce73c2a9ac283f602628cabf735dbe06c4ee87a7849fc5f3d1177c8f304" + }, + "kernelspec": { + "display_name": "Python 3.8.10 64-bit ('.venv': venv)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "metadata": { + "interpreter": { + "hash": 
"cdc2bce73c2a9ac283f602628cabf735dbe06c4ee87a7849fc5f3d1177c8f304" + } + }, + "orig_nbformat": 2 + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file