Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(core): added warnings, engine #1

Merged
merged 2 commits into from
Jun 30, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added src/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions src/ydata_quality/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""
YData open-source lib for Data Quality.
"""

from .version import __version__
11 changes: 11 additions & 0 deletions src/ydata_quality/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""
Core functionality for Data Quality analysis.
"""

from ydata_quality.core.warnings import QualityWarning
from ydata_quality.core.engine import QualityEngine

# Names exported by `from ydata_quality.core import *`.
__all__ = [
"QualityWarning",
"QualityEngine"
]
20 changes: 20 additions & 0 deletions src/ydata_quality/core/data_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""
Implementation of main class for Data Quality checks.
"""

import pandas as pd

class DataQuality:
    """Entry point that gathers the multiple data quality engines.

    Only the constructor is functional so far; `evaluate` and `report`
    are placeholders that raise `NotImplementedError`.
    """

    def __init__(self, df: pd.DataFrame):
        # Keep a reference to the dataset the checks will target.
        self.df = df

    def evaluate(self):
        """Run every individual data quality check and aggregate the results.

        Raises:
            NotImplementedError: always, the method is not implemented yet.
        """
        raise NotImplementedError()

    def report(self):
        """Return the full list of warnings collected by the data quality checks.

        Raises:
            NotImplementedError: always, the method is not implemented yet.
        """
        raise NotImplementedError()
40 changes: 40 additions & 0 deletions src/ydata_quality/core/engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
Implementation of abstract class for Data Quality engines.
"""
from abc import ABC
import pandas as pd

class QualityEngine(ABC):
    """Base class for running and storing a data quality analysis.

    The engine keeps three pieces of state:
      * the dataset under analysis (`df`),
      * a set of detected warnings (`warnings`),
      * a list of test names (`tests`); each name is looked up as a method
        on the instance when `evaluate` runs. The list starts empty, so
        subclasses are presumably expected to populate `_tests`.
    """

    def __init__(self, df: pd.DataFrame):
        self._df = df
        self._warnings = set()
        self._tests = []

    @property
    def df(self):
        "Target of data quality checks."
        return self._df

    @property
    def warnings(self):
        "Storage of all detected data quality warnings."
        return self._warnings

    @property
    def tests(self):
        "List of individual tests available for the data quality checks."
        return self._tests

    def report(self):
        """Print every stored warning, one per line, in ascending sort order.

        The warnings are kept in a set, which has no defined iteration order,
        so the sorted sequence is iterated directly rather than being stored
        back into a set (which would discard the ordering again).
        """
        # TODO: Provide a count of warnings by priority
        for warn in sorted(self._warnings):
            print(warn)

    def evaluate(self):
        """Run all the individual tests available within the same suite.

        Returns:
            A dict mapping each test name in `tests` to the value returned by
            calling the method of that name on this instance.
        """
        self._warnings = set()  # reset the warnings to avoid duplicates
        return {test: getattr(self, test)() for test in self.tests}
96 changes: 96 additions & 0 deletions src/ydata_quality/core/warnings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""
Definition of a data quality warning.
"""

from typing import Any

from pydantic import BaseModel
from ydata_quality.utils.enum import OrderedEnum


class Priority(OrderedEnum):
    """Expected impact of a data quality issue.

    Members:
        P0: blocks using the dataset
        P1: heavy impact expected
        P2: allows usage but may block human-intelligible insights
        P3: minor impact, aesthetic
    """
    P0 = 0
    P1 = 1
    P2 = 2
    P3 = 3

    def __str__(self):
        "Renders the member as 'Priority {value}: {long description}'."
        # Kept local to the method; the position in the tuple matches the
        # member value (0..3).
        summaries = (
            'blocks using the dataset',
            'heavy impact expected',
            'usage allowed, limited human intelligibility',
            'minor impact, aesthetic',
        )
        summary = summaries[self.value]
        return f"Priority {self.value}: {summary}"


class QualityWarning(BaseModel):
    """Record of a single issue detected during data quality analysis.

    Fields:
        category: name of the test suite (e.g. 'Exact Duplicates')
        test: name of the individual test
        description: long-text description of the results
        priority: expected impact of data quality issue
        data: sample data (optional, defaults to None)

    Ordering comparisons look at `priority` only; equality and hashing use
    category, test, description and priority (`data` is ignored).
    """

    category: str
    test: str
    description: str
    priority: Priority
    data: Any = None

    # ---------------------------------------------------------------- #
    # String representation
    # ---------------------------------------------------------------- #
    def __str__(self):
        "Formats the warning as '[TEST] description (priority)'."
        label = self.test.upper()
        impact = str(self.priority)
        return f"[{label}] {self.description} ({impact})"

    # ---------------------------------------------------------------- #
    # Ordering: delegated to the priority, only between same-class objects
    # ---------------------------------------------------------------- #
    def __ge__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.priority >= other.priority

    def __gt__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.priority > other.priority

    def __le__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.priority <= other.priority

    def __lt__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.priority < other.priority

    # ---------------------------------------------------------------- #
    # Hashing and equality
    # ---------------------------------------------------------------- #
    def __hash__(self):
        # A hash is required so that warnings can be stored in a set.
        identity = (self.category, self.test, self.description, self.priority)
        return hash(identity)

    def __eq__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        compared = ("category", "test", "description", "priority")
        return all(
            getattr(self, name) == getattr(other, name) for name in compared
        )
24 changes: 24 additions & 0 deletions src/ydata_quality/utils/enum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
Custom implementations of Enums.
"""

from enum import Enum

class OrderedEnum(Enum):
    """Enum whose members can be ordered by their `value`.

    Each comparison is only defined between members of the exact same enum
    class; for anything else `NotImplemented` is returned.
    """

    def __ge__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.value >= other.value

    def __gt__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.value > other.value

    def __le__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.value <= other.value

    def __lt__(self, other):
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.value < other.value