diff --git a/src/ydata_quality/__init__.py b/src/ydata_quality/__init__.py index 886255b2..f5a28591 100644 --- a/src/ydata_quality/__init__.py +++ b/src/ydata_quality/__init__.py @@ -1,5 +1,11 @@ """ YData open-source lib for Data Quality. """ +from ydata_quality.core.data_quality import DataQuality from .version import __version__ + + +__all__ = [ + "DataQuality" +] diff --git a/src/ydata_quality/core/__init__.py b/src/ydata_quality/core/__init__.py index b09e9065..44065e11 100644 --- a/src/ydata_quality/core/__init__.py +++ b/src/ydata_quality/core/__init__.py @@ -2,10 +2,10 @@ Core functionality for Data Quality analysis. """ -from ydata_quality.core.warnings import QualityWarning from ydata_quality.core.engine import QualityEngine +from ydata_quality.core.warnings import QualityWarning __all__ = [ - "QualityWarning", - "QualityEngine" + "QualityEngine", + "QualityWarning" ] diff --git a/src/ydata_quality/core/data_quality.py b/src/ydata_quality/core/data_quality.py index 680c2a5c..5ec5fe11 100644 --- a/src/ydata_quality/core/data_quality.py +++ b/src/ydata_quality/core/data_quality.py @@ -1,20 +1,98 @@ """ Implementation of main class for Data Quality checks. """ +from typing import List, Union, Optional, Callable import pandas as pd +from ydata_quality.core.warnings import QualityWarning, Priority +from ydata_quality.duplicates import DuplicateChecker +from ydata_quality.labelling import LabelInspector +from ydata_quality.missings import MissingsProfiler +from ydata_quality.valued_missing_values import VMVIdentifier +from ydata_quality.drift import DriftAnalyser + class DataQuality: - "DataQuality gathers the multiple data quality engines." + "DataQuality contains the multiple data quality engines." 
+ + def __init__(self, + df: pd.DataFrame, + label: str = None, + entities: List[Union[str, List[str]]] = [], + vmv_extensions: Optional[list]=[], + sample: Optional[pd.DataFrame] = None, + model: Callable = None + ): + """ + Engines: + - Duplicates + - Missing Values + - Labelling + - Valued Missing Values + - Drift Analysis - def __init__(self, df: pd.DataFrame): + Args: + df (pd.DataFrame): reference DataFrame used to run the DataQuality analysis. + label (str, optional): [MISSINGS, LABELLING, DRIFT ANALYSIS] target feature to be predicted. + If not specified, LABELLING is skipped. + entities: [DUPLICATES] entities relevant for duplicate analysis. + vmv_extensions: [VALUED MISSING VALUES] A list of user provided valued missing values to append to defaults. + sample: [DRIFT ANALYSIS] data against which drift is tested. + model: [DRIFT ANALYSIS] model wrapped by ModelWrapper used to test concept drift. + """ self.df = df + self._warnings = set() + self._engines = { # Default list of engines + 'duplicates': DuplicateChecker(df=df, entities=entities), + 'missings': MissingsProfiler(df=df, target=label), + 'valued-missing-values': VMVIdentifier(df=df, vmv_extensions=vmv_extensions), + 'drift-analysis': DriftAnalyser(ref=df, sample=sample, label=label, model=model) + } + + # Engines based on mandatory arguments + if label is not None: + self._engines['labelling'] = LabelInspector(df=df, label=label) + else: + print('Label is not defined. Skipping LABELLING engine.') + + + @property + def warnings(self): + "Set of warnings generated by individual QualityEngines." + return self._warnings + + def get_warnings(self, + category: Optional[str] = None, + test: Optional[str] = None, + priority: Optional[Priority] = None) -> List[QualityWarning]: + "Retrieves warnings filtered by their properties." 
+ filtered = list(self.warnings) # convert original set + filtered = [w for w in filtered if w.category == category] if category else filtered + filtered = [w for w in filtered if w.test == test] if test else filtered + filtered = [w for w in filtered if w.priority == Priority(priority)] if priority else filtered + filtered.sort() # sort by priority + return filtered + + @property + def engines(self): + "Dictionary of instantiated engines to run data quality analysis." + return self._engines + + def __store_warnings(self): + "Appends all warnings from individual engines into warnings of DataQuality main class." + for engine in self.engines.values(): + self._warnings = self._warnings.union(set(engine.get_warnings())) def evaluate(self): "Runs all the individual data quality checks and aggregates the results." - raise NotImplementedError - + results = {name: engine.evaluate() for name, engine in self.engines.items()} + self.__store_warnings() + return results def report(self): - "Returns a full list of warnings retrieved during the Data Quality checks." - raise NotImplementedError + "Prints a report containing all the warnings detected during the data quality analysis." 
+ # TODO: Provide a count of warnings by priority + self.__store_warnings() # fetch all warnings from the engines + for warn in self.get_warnings(): + print(warn) + diff --git a/src/ydata_quality/core/engine.py b/src/ydata_quality/core/engine.py index a3337410..087aa722 100644 --- a/src/ydata_quality/core/engine.py +++ b/src/ydata_quality/core/engine.py @@ -5,8 +5,7 @@ from typing import Optional import pandas as pd -from ydata_quality.core import QualityWarning -from ydata_quality.core.warnings import Priority +from ydata_quality.core.warnings import QualityWarning, Priority from ydata_quality.utils.modelling import infer_dtypes class QualityEngine(ABC): @@ -76,11 +75,12 @@ def get_warnings(self, test: Optional[str] = None, priority: Optional[Priority] = None): "Retrieves warnings filtered by their properties." - filtered = self.warnings # original set + filtered = list(self.warnings) # convert original set filtered = [w for w in filtered if w.category == category] if category else filtered filtered = [w for w in filtered if w.test == test] if test else filtered filtered = [w for w in filtered if w.priority == Priority(priority)] if priority else filtered - return set(filtered) + filtered.sort() # sort by priority + return filtered @property def tests(self): @@ -97,4 +97,11 @@ def report(self): def evaluate(self): "Runs all the indidividual tests available within the same suite. Returns a dict of (name: results)." self._warnings = set() # reset the warnings to avoid duplicates - return {test: getattr(self, test)() for test in self.tests} + results = {} + for test in self.tests: + try: # if anything fails + results[test] = getattr(self, test)() + except Exception as exc: # print a Warning and log the message + print(f'WARNING: Skipping test {test} due to failure during computation.') + results[test] = "[ERROR] Test failed to compute. 
Original exception: "+f"{exc}" + return results diff --git a/src/ydata_quality/drift/engine.py b/src/ydata_quality/drift/engine.py index ddac29de..d0a31828 100644 --- a/src/ydata_quality/drift/engine.py +++ b/src/ydata_quality/drift/engine.py @@ -9,7 +9,6 @@ from scipy.stats import ks_2samp from scipy.stats._continuous_distns import chi2_gen from ydata_quality.core import QualityEngine, QualityWarning -from ydata_quality.utils.modelling import infer_dtypes class ModelWrapper: diff --git a/src/ydata_quality/valued_missing_values/engine.py b/src/ydata_quality/valued_missing_values/engine.py index 2bb66f84..1039a423 100644 --- a/src/ydata_quality/valued_missing_values/engine.py +++ b/src/ydata_quality/valued_missing_values/engine.py @@ -24,7 +24,7 @@ def __init__(self, df: pd.DataFrame, vmv_extensions: Optional[list]=[]): self._flatline_index = {} self.__default_index_name = '__index' self.vmvs = vmv_extensions - + @property def default_vmvs(self): """Returns the default list of Valued Missing Values. @@ -61,7 +61,7 @@ def __get_flatline_index(self, column_name: str, th: Optional[int] = 1): if column_name == self.__default_index_name: df[self.__default_index_name] = df.index # Index now in columns to be processed next column = df[column_name] - column.fillna('__filled') # So NaN values are considered + column.fillna('__filled') # So NaN values are considered sequence_indexes = column.ne(column.shift()).cumsum() # Everytime shifted value is different from previous a new sequence starts sequence_groups = column.index.to_series().groupby(sequence_indexes) # Group series indexes by sequence indexes data = {'length': sequence_groups.count().values, @@ -102,7 +102,6 @@ def predefined_valued_missing_values(self, skip: list=[], short: bool = True): Raises warning based on the existence of these values. VMVs of string type are case insensitive during search. Returns a DataFrame with count distribution for each predefined type over each column. 
- The result DataFrame will ommit any Arguments: skip: List of columns that will not be target of search for vmvs. Pass '__index' in skip to skip looking for flatlines at the index. @@ -122,7 +121,7 @@ def predefined_valued_missing_values(self, skip: list=[], short: bool = True): vmvs.drop(no_vmv_rows, inplace=True) if vmvs.empty: print("[PREDEFINED VALUED MISSING VALUES] No predefined vmvs from the set {} were found in the dataset.".format( - self.predefined_valued_missing_values + self.vmvs )) else: total_vmvs = vmvs.sum().sum() @@ -131,4 +130,4 @@ def predefined_valued_missing_values(self, skip: list=[], short: bool = True): test='Predefined Valued Missing Values', category='Valued Missing Values', priority=2, data=vmvs, description=f"Found {total_vmvs} vmvs in the dataset." )) - return vmvs \ No newline at end of file + return vmvs diff --git a/tutorials/data_quality.ipynb b/tutorials/data_quality.ipynb new file mode 100644 index 00000000..c129332f --- /dev/null +++ b/tutorials/data_quality.ipynb @@ -0,0 +1,231 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# YData Quality - DataQuality Tutorial\n", + "Time-to-Value: 4 minutes\n", + "\n", + "This notebook provides a tutorial to run the `ydata_quality.DataQuality` main class that aggregates all the individual data quality engines, each focused on a main topic of data quality (e.g. duplicates, missing values).\n", + "\n", + "**Structure:**\n", + "\n", + "1. Load dataset\n", + "2. Distort dataset\n", + "3. Instantiate the Data Quality engine\n", + "4. Run the quality checks\n", + "5. Assess the warnings\n", + "6. 
(Extra) Detailed overview" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import statsmodels.api as sm\n", + "import pandas as pd\n", + "import numpy as np\n", + "from ydata_quality import DataQuality" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the example dataset\n", + "We will use a dataset available from the statsmodels package." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df_original = sm.datasets.get_rdataset('Guerry', 'HistData').data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Distort the original dataset\n", + "Apply transformations to highlight the data quality functionalities." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def apply_quality_transformations(df: pd.DataFrame):\n", + " \"Force data quality issues to highlight functionality.\"\n", + " # Copy to guarantee the original is kept intact\n", + " df = df.copy()\n", + "\n", + " # Duplicates\n", + " df = df.append(df[:20], ignore_index=True)\n", + " df[\"dept2\"] = df[\"dept\"]\n", + " return df\n", + "\n", + "df = apply_quality_transformations(df_original)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create the main engine\n", + "The DataQuality class aggregates all the individual data quality engines, each focused on a main topic of data quality (e.g. duplicates, missing values). To create a DataQuality object, you provide:\n", + "- df: target DataFrame, for which we will run the test suite\n", + "- label (optional): target feature to be predicted in a supervised learning context\n", + "- entities (optional): list of feature names for which checking duplicates after grouping-by is applicable.\n", + "- vmv_extensions (optional): list of valued missing values to append to the defaults." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "VMV_EXTENSIONS = ['a_custom_VMV', 'another_VMV', 999999999, '!', '', 'UNKNOWN']" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "dq = DataQuality(df=df, label='Pop1831', vmv_extensions=VMV_EXTENSIONS)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Full Evaluation\n", + "The easiest way to assess the data quality analysis is to run `.evaluate()`, which runs every engine and returns a dictionary with the results of each quality check. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ENTITY DUPLICATES] There are no entities defined to run the analysis. Skipping the test.\n", + "WARNING: Skipping test predict_missings due to failure during computation.\n", + "[PREDEFINED VALUED MISSING VALUES] No predefined vmvs from the set {'', '(blank)', 'na', 'unknown', 'unk', 'n/a', 'another_vmv', 'a_custom_vmv', '?', '!', 999999999} were found in the dataset.\n", + "WARNING: Skipping test ref_covariate_drift due to failure during computation.\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[MISSING LABELS] No missing labels were found.\n", + "[TEST NORMALITY] It was not possible to normalize the label values. See the warning message for additional context.\n" + ] + } + ], + "source": [ + "full_results = dq.evaluate()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check the status\n", + "After running the data quality checks, you can check the warnings for each individual test. The warnings are sorted by priority and have additional details that can provide better insights for Data Scientists." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[TEST NORMALITY] The label distribution failed to pass a normality test as-is and following a battery of transforms. \n", + "\tIt is possible that the data originates from an exotic distribution, there is heavy outlier presence or it is multimodal. \n", + "\tAddressing this issue might prove critical for regressor performance. (Priority 1: heavy impact expected)\n", + "[DUPLICATE COLUMNS] Found 1 columns with exactly the same feature values as other columns. (Priority 1: heavy impact expected)\n", + "[EXACT DUPLICATES] Found 20 instances with exact duplicate feature values. (Priority 2: usage allowed, limited human intelligibility)\n", + "[FLATLINES] Found 8 flatline events with a minimun length of 5 among the columns {'Region', 'MainCity'}. (Priority 2: usage allowed, limited human intelligibility)\n", + "[OUTLIER DETECTION] Found 2 potential outliers across the full dataset. \n", + "\tA distance bigger than 3.0 standard deviations of intra-cluster distances to the respective centroids was used to define the potential outliers. 
(Priority 2: usage allowed, limited human intelligibility)\n" + ] + } + ], + "source": [ + "# Print the overall status of Data Quality discovered during evaluation\n", + "dq.report()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# You can retrieve the full list of warnings or filtered by specific conditions\n", + "data_quality_warnings = dq.get_warnings()\n", + "duplicate_quality_warnings = dq.get_warnings(category='Duplicates')\n", + "priority_2_warnings = dq.get_warnings(priority=2)" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "fdb8905eeefe08da097059bda365f0d7e393b9cc818106eee5be3ebd28cc5e41" + }, + "kernelspec": { + "display_name": "Python 3.8.10 ('.venv': venv)", + "name": "pythonjvsc74a57bd0cdc2bce73c2a9ac283f602628cabf735dbe06c4ee87a7849fc5f3d1177c8f304" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "metadata": { + "interpreter": { + "hash": "cdc2bce73c2a9ac283f602628cabf735dbe06c4ee87a7849fc5f3d1177c8f304" + } + }, + "orig_nbformat": 2 + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file