generated from ydataai/opensource-template
-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(engine): added main engine mvp (#7)
Features: - Automatically run the whole bundle of available engines - Graceful handling of errors in individual tests (core)
- Loading branch information
UrbanoFonseca
authored
Jul 25, 2021
1 parent
7e54a2d
commit 41993ca
Showing
7 changed files
with
340 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,11 @@ | ||
""" | ||
YData open-source lib for Data Quality. | ||
""" | ||
from ydata_quality.core.data_quality import DataQuality | ||
|
||
from .version import __version__ | ||
|
||
|
||
__all__ = [ | ||
"DataQuality" | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,98 @@ | ||
""" | ||
Implementation of main class for Data Quality checks. | ||
""" | ||
from typing import List, Union, Optional, Callable | ||
|
||
import pandas as pd | ||
|
||
from ydata_quality.core.warnings import QualityWarning, Priority | ||
from ydata_quality.duplicates import DuplicateChecker | ||
from ydata_quality.labelling import LabelInspector | ||
from ydata_quality.missings import MissingsProfiler | ||
from ydata_quality.valued_missing_values import VMVIdentifier | ||
from ydata_quality.drift import DriftAnalyser | ||
|
||
class DataQuality: | ||
"DataQuality gathers the multiple data quality engines." | ||
"DataQuality contains the multiple data quality engines." | ||
|
||
def __init__(self, | ||
df: pd.DataFrame, | ||
label: str = None, | ||
entities: List[Union[str, List[str]]] = [], | ||
vmv_extensions: Optional[list]=[], | ||
sample: Optional[pd.DataFrame] = None, | ||
model: Callable = None | ||
): | ||
""" | ||
Engines: | ||
- Duplicates | ||
- Missing Values | ||
- Labelling | ||
- Valued Missing Values | ||
- Drift Analysis | ||
def __init__(self, df: pd.DataFrame): | ||
Args: | ||
df (pd.DataFrame): reference DataFrame used to run the DataQuality analysis. | ||
label (str, optional): [MISSINGS, LABELLING, DRIFT ANALYSIS] target feature to be predicted. | ||
If not specified, LABELLING is skipped. | ||
entities: [DUPLICATES] entities relevant for duplicate analysis. | ||
vmv_extensions: [VALUED MISSING VALUES] A list of user-provided valued missing values to append to the defaults. | ||
sample: [DRIFT ANALYSIS] data against which drift is tested. | ||
model: [DRIFT ANALYSIS] model wrapped by ModelWrapper used to test concept drift. | ||
""" | ||
self.df = df | ||
self._warnings = set() | ||
self._engines = { # Default list of engines | ||
'duplicates': DuplicateChecker(df=df, entities=entities), | ||
'missings': MissingsProfiler(df=df, target=label), | ||
'valued-missing-values': VMVIdentifier(df=df, vmv_extensions=vmv_extensions), | ||
'drift-analysis': DriftAnalyser(ref=df, sample=sample, label=label, model=model) | ||
} | ||
|
||
# Engines based on mandatory arguments | ||
if label is not None: | ||
self._engines['labelling'] = LabelInspector(df=df, label=label) | ||
else: | ||
print('Label is not defined. Skipping LABELLING engine.') | ||
|
||
|
||
@property | ||
def warnings(self): | ||
"Set of warnings generated by individual QualityEngines." | ||
return self._warnings | ||
|
||
def get_warnings(self, | ||
category: Optional[str] = None, | ||
test: Optional[str] = None, | ||
priority: Optional[Priority] = None) -> List[QualityWarning]: | ||
"Retrieves warnings filtered by their properties." | ||
filtered = list(self.warnings) # convert original set | ||
filtered = [w for w in filtered if w.category == category] if category else filtered | ||
filtered = [w for w in filtered if w.test == test] if test else filtered | ||
filtered = [w for w in filtered if w.priority == Priority(priority)] if priority else filtered | ||
filtered.sort() # sort by priority | ||
return filtered | ||
|
||
@property | ||
def engines(self): | ||
"Dictionary of instantiated engines to run data quality analysis." | ||
return self._engines | ||
|
||
def __store_warnings(self): | ||
"Appends all warnings from individual engines into warnings of DataQuality main class." | ||
for engine in self.engines.values(): | ||
self._warnings = self._warnings.union(set(engine.get_warnings())) | ||
|
||
def evaluate(self): | ||
"Runs all the individual data quality checks and aggregates the results." | ||
raise NotImplementedError | ||
|
||
results = {name: engine.evaluate() for name, engine in self.engines.items()} | ||
self.__store_warnings() | ||
return results | ||
|
||
def report(self): | ||
"Returns a full list of warnings retrieved during the Data Quality checks." | ||
raise NotImplementedError | ||
"Prints a report containing all the warnings detected during the data quality analysis." | ||
# TODO: Provide a count of warnings by priority | ||
self.__store_warnings() # fetch all warnings from the engines | ||
for warn in self.get_warnings(): | ||
print(warn) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.