From 2eb03eb9988b500d7b66f2a25e1aa33a8512467b Mon Sep 17 00:00:00 2001
From: UrbanoFonseca
Date: Tue, 29 Jun 2021 23:53:27 +0100
Subject: [PATCH 1/2] feat(core): added warnings, engine

---
 src/__init__.py                        |  0
 src/ydata_quality/__init__.py          |  5 ++
 src/ydata_quality/core/__init__.py     | 11 +++
 src/ydata_quality/core/data_quality.py | 20 ++++++
 src/ydata_quality/core/engine.py       | 42 +++++++++++
 src/ydata_quality/core/warnings.py     | 98 ++++++++++++++++++++++++++
 6 files changed, 176 insertions(+)
 create mode 100644 src/__init__.py
 create mode 100644 src/ydata_quality/__init__.py
 create mode 100644 src/ydata_quality/core/__init__.py
 create mode 100644 src/ydata_quality/core/data_quality.py
 create mode 100644 src/ydata_quality/core/engine.py
 create mode 100644 src/ydata_quality/core/warnings.py

diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/ydata_quality/__init__.py b/src/ydata_quality/__init__.py
new file mode 100644
index 00000000..886255b2
--- /dev/null
+++ b/src/ydata_quality/__init__.py
@@ -0,0 +1,5 @@
+"""
+YData open-source lib for Data Quality.
+"""
+
+from .version import __version__
diff --git a/src/ydata_quality/core/__init__.py b/src/ydata_quality/core/__init__.py
new file mode 100644
index 00000000..b09e9065
--- /dev/null
+++ b/src/ydata_quality/core/__init__.py
@@ -0,0 +1,11 @@
+"""
+Core functionality for Data Quality analysis.
+"""
+
+from ydata_quality.core.warnings import QualityWarning
+from ydata_quality.core.engine import QualityEngine
+
+__all__ = [
+    "QualityWarning",
+    "QualityEngine"
+]
diff --git a/src/ydata_quality/core/data_quality.py b/src/ydata_quality/core/data_quality.py
new file mode 100644
index 00000000..680c2a5c
--- /dev/null
+++ b/src/ydata_quality/core/data_quality.py
@@ -0,0 +1,20 @@
+"""
+Implementation of main class for Data Quality checks.
+"""
+
+import pandas as pd
+
+class DataQuality:
+    "DataQuality gathers the multiple data quality engines."
+
+    def __init__(self, df: pd.DataFrame):
+        self.df = df
+
+    def evaluate(self):
+        "Runs all the individual data quality checks and aggregates the results."
+        raise NotImplementedError
+
+
+    def report(self):
+        "Returns a full list of warnings retrieved during the Data Quality checks."
+        raise NotImplementedError
diff --git a/src/ydata_quality/core/engine.py b/src/ydata_quality/core/engine.py
new file mode 100644
index 00000000..ffd17c2d
--- /dev/null
+++ b/src/ydata_quality/core/engine.py
@@ -0,0 +1,42 @@
+"""
+Implementation of abstract class for Data Quality engines.
+"""
+from abc import ABC
+import pandas as pd
+
+class QualityEngine(ABC):
+    "Main class for running and storing data quality analysis."
+
+    def __init__(self, df: pd.DataFrame):
+        self._df = df
+        self._warnings = set()
+        self._tests = []
+
+    @property
+    def df(self):
+        "Target of data quality checks."
+        return self._df
+
+    @property
+    def warnings(self):
+        "Storage of all detected data quality warnings."
+        return self._warnings
+
+    @property
+    def tests(self):
+        "List of individual tests available for the data quality checks."
+        return self._tests
+
+    def report(self):
+        "Prints a report containing all the warnings detected during the data quality analysis."
+        # TODO: Provide a count of warnings by priority
+        # A set has no order, so sort into a list at print time (lowest priority value,
+        # i.e. P0, first) instead of rebuilding a set, which would drop the ordering again.
+        for warn in sorted(self.warnings):
+            print(warn)
+
+    def evaluate(self):
+        "Runs all the individual tests available within the same suite. Returns a dict of (name: results)."
+        self._warnings = set() # reset the warnings to avoid duplicates
+        # Each entry of self.tests is the name of a method on this engine, looked up via getattr.
+        return {test: getattr(self, test)() for test in self.tests}
diff --git a/src/ydata_quality/core/warnings.py b/src/ydata_quality/core/warnings.py
new file mode 100644
index 00000000..004e623e
--- /dev/null
+++ b/src/ydata_quality/core/warnings.py
@@ -0,0 +1,98 @@
+"""
+Definition of a data quality warning.
+"""
+
+from typing import Any
+
+from pydantic import BaseModel
+from ydata_quality.utils.enum import OrderedEnum
+
+
+class Priority(OrderedEnum):
+    """Priorities translate the expected impact of data quality issues.
+
+    Priorities:
+        P0: blocks using the dataset
+        P1: heavy impact expected
+        P2: allows usage but may block human-intelligible insights
+        P3: minor impact, aesthetic
+    """
+    P0 = 0
+    P1 = 1
+    P2 = 2
+    P3 = 3
+
+    def __str__(self):
+        "Priority {value}: {long description}"
+        _descriptions = {
+            0: 'blocks using the dataset',
+            1: 'heavy impact expected',
+            2: 'usage allowed, limited human intelligibility',
+            3: 'minor impact, aesthetic'
+        }
+        return f"Priority {self.value}: {_descriptions[self.value]}"
+
+
+class QualityWarning(BaseModel):
+    """ Details for issues detected during data quality analysis.
+
+    category: name of the test suite (e.g. 'Exact Duplicates')
+    test: name of the individual test
+    description: long-text description of the results
+    priority: expected impact of data quality issue
+    data: sample data
+    """
+
+    category: str
+    test: str
+    description: str
+    priority: Priority
+    data: Any = None
+
+    #########################
+    # String Representation #
+    #########################
+    def __str__(self):
+        return f"[{self.test.upper()}] {self.description} ({str(self.priority)})"
+
+    ########################
+    # Comparison Operators #
+    ########################
+    # Ordering uses `priority` only; __eq__ (below) also compares category, test and description.
+    def __ge__(self, other):
+        if self.__class__ is other.__class__:
+            return self.priority >= other.priority
+        return NotImplemented
+    def __gt__(self, other):
+        if self.__class__ is other.__class__:
+            return self.priority > other.priority
+        return NotImplemented
+    def __le__(self, other):
+        if self.__class__ is other.__class__:
+            return self.priority <= other.priority
+        return NotImplemented
+    def __lt__(self, other):
+        if self.__class__ is other.__class__:
+            return self.priority < other.priority
+        return NotImplemented
+
+    ##########################
+    # Hashable Definition    #
+    ##########################
+
+    def __hash__(self):
+        # Hashable definition is needed for storing the elements in a set.
+        # `data` is left out of the hash, matching the fields compared in __eq__ below.
+        return hash((self.category, self.test, self.description, self.priority))
+
+    def __eq__(self, other):
+        if self.__class__ is other.__class__:
+            return all(
+                (
+                    self.category == other.category,
+                    self.test == other.test,
+                    self.description == other.description,
+                    self.priority == other.priority
+                )
+            )
+        return NotImplemented

From 1042def4238cc37174e97cac122a7782476a3afb Mon Sep 17 00:00:00 2001
From: UrbanoFonseca
Date: Tue, 29 Jun 2021 23:58:23 +0100
Subject: [PATCH 2/2] feat(utils): added OrderedEnum

---
 src/ydata_quality/utils/enum.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 src/ydata_quality/utils/enum.py

diff --git a/src/ydata_quality/utils/enum.py b/src/ydata_quality/utils/enum.py
new file mode 100644
index 00000000..f6bcb868
--- /dev/null
+++ b/src/ydata_quality/utils/enum.py
@@ -0,0 +1,24 @@
+"""
+Custom implementations of Enums.
+"""
+
+from enum import Enum
+
+class OrderedEnum(Enum):
+    "Enum with support for ordering."
+    def __ge__(self, other):
+        if self.__class__ is other.__class__:
+            return self.value >= other.value
+        return NotImplemented
+    def __gt__(self, other):
+        if self.__class__ is other.__class__:
+            return self.value > other.value
+        return NotImplemented
+    def __le__(self, other):
+        if self.__class__ is other.__class__:
+            return self.value <= other.value
+        return NotImplemented
+    def __lt__(self, other):
+        if self.__class__ is other.__class__:
+            return self.value < other.value
+        return NotImplemented