feat: add loggers and change prints to logs #27

Merged · 11 commits · Sep 22, 2021
Changes from 8 commits
13 changes: 7 additions & 6 deletions src/ydata_quality/bias_fairness/engine.py
@@ -22,14 +22,16 @@ class BiasFairness(QualityEngine):
"""

def __init__(self, df: pd.DataFrame, sensitive_features: List[str], label: Optional[str] = None,
random_state: Optional[int] = None):
random_state: Optional[int] = None, severity: Optional[str]= None):
"""
Args
df (pd.DataFrame): reference DataFrame used to run the analysis
sensitive_features (List[str]): features deemed as sensitive attributes
label (str, optional): target feature to be predicted
severity (str, optional): Sets the logger warning threshold to one of the valid levels
[DEBUG, INFO, WARNING, ERROR, CRITICAL]
"""
super().__init__(df=df, label=label, random_state=random_state)
super().__init__(df=df, label=label, random_state=random_state, severity=severity)
self._sensitive_features = sensitive_features
self._tests = ["performance_discrimination", "proxy_identification",
"sensitive_predictability", "sensitive_representativity"]
@@ -72,7 +74,7 @@ def sensitive_predictability(self, th=0.5, adjusted_metric=True):
performances = pd.Series(index=self.sensitive_features)
for feat in performances.index:
data = self.df.drop(columns=[x for x in drop_features if x != feat]) # drop all except target
performances[feat] = baseline_performance(df=data, target=feat, adjusted_metric=adjusted_metric)
performances[feat] = baseline_performance(df=data, label=feat, adjusted_metric=adjusted_metric)

high_perfs = performances[performances>th]
if len(high_perfs) > 0:
@@ -93,12 +95,11 @@ def performance_discrimination(self):
"""
# TODO: support error rate parity metrics (e.g. false positive rate, positive rate)
if self.label is None:
print('[BIAS&FAIRNESS] Argument "label" must be defined to calculate performance discrimination metric. Skipping test.')
pass
self._logger.warning('Argument "label" must be defined to calculate performance discrimination metric. Skipping test.')

res = {}
for feat in self.sensitive_features:
res[feat] = pd.Series(performance_per_feature_values(df=self.df, feature=feat, target=self.label))
res[feat] = pd.Series(performance_per_feature_values(df=self.df, feature=feat, label=self.label))
return res


38 changes: 22 additions & 16 deletions src/ydata_quality/core/data_quality.py
@@ -15,6 +15,7 @@
from ydata_quality.data_expectations import DataExpectationsReporter
from ydata_quality.bias_fairness import BiasFairness
from ydata_quality.data_relations import DataRelationsDetector
from ydata_quality.utils.logger import get_logger, NAME

class DataQuality:
"DataQuality contains the multiple data quality engines."
@@ -37,8 +38,8 @@ def __init__(self,
corr_th: float = 0.8,
vif_th: float = 5,
p_th: float = 0.05,
plot: bool = True
):
plot: bool = True,
severity: str= 'ERROR'):
"""
Engines:
- Duplicates
@@ -55,7 +56,7 @@ def __init__(self,
label (str, optional): [MISSINGS, LABELLING, DRIFT ANALYSIS] target feature to be predicted.
If not specified, LABELLING is skipped.
random_state (int, optional): Integer seed for random reproducibility. Default is None.
Set to None for fully random behaviour, no reproducibility.
Set to None for fully random behavior, no reproducibility.
entities: [DUPLICATES] entities relevant for duplicate analysis.
is_close: [DUPLICATES] Pass True to use numpy.isclose instead of pandas.equals in column comparison.
ed_extensions: [ERRONEOUS DATA] A list of user provided erroneous data values to append to defaults.
@@ -71,19 +72,22 @@ def __init__(self,
vif_th (float): [DATA RELATIONS] Variance Inflation Factor threshold for numerical independence test, typically 5-10 is recommended. Defaults to 5.
p_th (float): [DATA RELATIONS] Fraction of the right tail of the chi squared CDF defining threshold for categorical independence test. Defaults to 0.05.
plot (bool): Pass True to produce all available graphical outputs, False to suppress all graphical output.
severity (str): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL]
"""
#TODO: Refactor legacy engines (property based) and logic in this class to new base (lean objects)
self.df = df
self._warnings = list()
self._logger = get_logger(NAME, level=severity)
self._random_state = random_state

self._engines_legacy = { # Default list of engines
'duplicates': DuplicateChecker(df=df, entities=entities, is_close=is_close),
'missings': MissingsProfiler(df=df, target=label, random_state=self.random_state),
'erroneous-data': ErroneousDataIdentifier(df=df, ed_extensions=ed_extensions),
'drift': DriftAnalyser(ref=df, sample=sample, label=label, model=model, random_state=self.random_state)
'duplicates': DuplicateChecker(df=df, entities=entities, is_close=is_close, severity=severity),
'missings': MissingsProfiler(df=df, label=label, random_state=self.random_state, severity=severity),
'erroneous-data': ErroneousDataIdentifier(df=df, ed_extensions=ed_extensions, severity=severity),
'drift': DriftAnalyser(ref=df, sample=sample, label=label, model=model, random_state=self.random_state, severity=severity)
}

self._engines_new = {'data-relations': DataRelationsDetector()}
self._engines_new = {'data-relations': DataRelationsDetector(severity=severity)}
self._eval_args = { # Argument lists for different engines
# TODO: centralize shared args in a dictionary to pass just like a regular kwargs to engines, pass specific args in arg list (define here)
# In new standard all engines can be run at the evaluate method only, the evaluate run expression can then be:
@@ -94,18 +98,20 @@ def __init__(self,

# Engines based on mandatory arguments
if label is not None:
self._engines_legacy['labelling'] = LabelInspector(df=df, label=label, random_state=self.random_state)
self._engines_legacy['labelling'] = LabelInspector(df=df, label=label,
random_state=self.random_state, severity=severity)
else:
print('Label is not defined. Skipping LABELLING engine.')
self._logger.warning('Label is not defined. Skipping LABELLING engine.')
if len(sensitive_features)>0:
self._engines_legacy['bias&fairness'] = BiasFairness(df=df, sensitive_features=sensitive_features,
label=label, random_state=self.random_state)
label=label, random_state=self.random_state,
severity=severity)
else:
print('Sensitive features not defined. Skipping BIAS & FAIRNESS engine.')
self._logger.warning('Sensitive features not defined. Skipping BIAS & FAIRNESS engine.')
if results_json_path is not None:
self._engines_new['expectations'] = DataExpectationsReporter()
self._engines_new['expectations'] = DataExpectationsReporter(severity=severity)
else:
print('The path to a Great Expectations results json is not defined. Skipping EXPECTATIONS engine.')
self._logger.warning('The path to a Great Expectations results json is not defined. Skipping EXPECTATIONS engine.')


def __clean_warnings(self):
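
The logger utility itself (ydata_quality/utils/logger.py) is not part of this diff. As a rough sketch only, assuming get_logger wraps the standard logging module and NAME is the package-level logger name, it could look something like:

```python
import logging
import sys

NAME = 'ydata_quality'  # assumed package-level logger name

def get_logger(name: str = NAME, level: str = 'WARNING') -> logging.Logger:
    """Return a named logger; level is one of DEBUG, INFO, WARNING, ERROR, CRITICAL."""
    logger = logging.getLogger(name)
    logger.setLevel(level or 'WARNING')  # fall back to WARNING when severity is None
    if not logger.handlers:              # avoid stacking handlers on repeated calls
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter('%(asctime)s [%(name)s] %(levelname)s: %(message)s'))
        logger.addHandler(handler)
    return logger
```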
@@ -140,7 +146,7 @@ def random_state(self, new_state):
if new_state==None or (isinstance(new_state, int) and new_state>=0):
self._random_state = new_state
else:
print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None (no reproducibility).')
self._logger.warning('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None (no reproducibility).')
self._random_state = None

def __store_warnings(self):
@@ -158,7 +164,7 @@ def report(self):
self.__store_warnings() # fetch all warnings from the engines
self.__clean_warnings()
if not self._warnings:
print('No warnings found.')
self._logger.info('No warnings found.')
else:
prio_counts = Counter([warn.priority.value for warn in self._warnings])
print('Warnings count by priority:')
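
Since the top-level DataQuality constructor forwards severity to every engine it instantiates, a single argument now controls the verbosity of the whole run. A sketch with a hypothetical DataFrame and label; the import path is inferred from the file location:

```python
import pandas as pd
from ydata_quality.core.data_quality import DataQuality

df = pd.DataFrame({'feature_a': [1, 2, 3, 4], 'target': [0, 1, 0, 1]})  # toy data

# One severity value is propagated to DuplicateChecker, MissingsProfiler,
# ErroneousDataIdentifier, DriftAnalyser and the conditionally created engines.
dq = DataQuality(df=df, label='target', severity='INFO')
dq.report()  # with no stored warnings this now logs 'No warnings found.' at INFO level
```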
29 changes: 17 additions & 12 deletions src/ydata_quality/core/engine.py
@@ -10,16 +10,17 @@

from ydata_quality.core.warnings import Priority, QualityWarning
from ydata_quality.utils.auxiliary import infer_df_type, infer_dtypes
from ydata_quality.utils.enum import DataFrameType
from ydata_quality.utils.logger import get_logger, NAME


class QualityEngine(ABC):
"Main class for running and storing data quality analysis."

def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label: str = None, dtypes: dict = None):
def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label: str = None, dtypes: dict = None, severity: Optional[str]= None):
self._df = df
self._df_type = None
self._warnings = list()
self._logger = get_logger(NAME, level=severity)
self._tests = []
self._label = label
self._dtypes = dtypes
@@ -37,9 +38,8 @@ def label(self):

@label.setter
def label(self, label: str):
if not isinstance(label, str):
raise ValueError("Property 'label' should be a string.")
assert label in self.df.columns, "Given label should exist as a DataFrame column."
assert isinstance(label, str), "Property 'label' should be a string."
assert label in self.df.columns, "Provided label %s does not exist as a DataFrame column." % label
self._label = label

@property
@@ -52,11 +52,16 @@ def dtypes(self):
@dtypes.setter
def dtypes(self, dtypes: dict):
if not isinstance(dtypes, dict):
raise ValueError("Property 'dtypes' should be a dictionary.")
assert all(col in self.df.columns for col in dtypes), "All dtypes keys must be columns in the dataset."
self._logger.warning("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.")
dtypes = {}
cols_not_in_df = [col for col in dtypes if col not in self.df.columns]
if len(cols_not_in_df) > 0:
self._logger.warning("Passed dtypes keys %s are not columns of the provided dataset.", cols_not_in_df)
supported_dtypes = ['numerical', 'categorical']
assert all(dtype in supported_dtypes for dtype in dtypes.values()), "Assigned dtypes must be in the supported \
broad dtype list: {}.".format(supported_dtypes)
wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in supported_dtypes]
if len(wrong_dtypes) > 0:
self._logger.warning("Columns %s of dtypes were not defined with a supported dtype and will be inferred.", wrong_dtypes)
dtypes = {key:val for key, val in dtypes.items() if key not in cols_not_in_df+wrong_dtypes}
df_col_set = set(self.df.columns)
dtypes_col_set = set(dtypes.keys())
missing_cols = df_col_set.difference(dtypes_col_set)
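
The dtypes setter now degrades gracefully: unknown keys and unsupported dtype values are logged and dropped, then inferred, instead of tripping an assertion. A hedged illustration with hypothetical columns:

```python
import pandas as pd
from ydata_quality.bias_fairness import BiasFairness

df = pd.DataFrame({'gender': ['F', 'M'], 'age': [25, 32], 'income': [0, 1]})  # toy data
bf = BiasFairness(df=df, sensitive_features=['gender'], severity='WARNING')

# 'not_a_column' is missing from df and 'date' is not a supported broad dtype;
# both entries are now logged and dropped, and the affected dtypes are inferred instead.
bf.dtypes = {'gender': 'categorical', 'not_a_column': 'numerical', 'age': 'date'}
```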
@@ -85,7 +90,7 @@ def random_state(self, new_state):
self._random_state = new_state
random.seed(self.random_state)
except:
print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None.')
self._logger.warning('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None.')
self._random_state = None

def __clean_warnings(self):
Expand Down Expand Up @@ -116,7 +121,7 @@ def report(self):
"Prints a report containing all the warnings detected during the data quality analysis."
self.__clean_warnings()
if not self._warnings:
print('No warnings found.')
self._logger.info('No warnings found.')
else:
prio_counts = Counter([warn.priority.value for warn in self._warnings])
print('Warnings count by priority:')
@@ -133,6 +138,6 @@ def evaluate(self):
try: # if anything fails
results[test] = getattr(self, test)()
except Exception as exc: # print a Warning and log the message
print(f'WARNING: Skipping test {test} due to failure during computation.')
self._logger.warning('Skipping %s due to failure during computation. See results folder of this test for further details.', test)
results[test] = "[ERROR] Test failed to compute. Original exception: "+f"{exc}"
return results
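
Because messages now flow through a named logger rather than print(), callers can tune or redirect them with standard logging configuration. A small sketch, assuming NAME resolves to the 'ydata_quality' logger name:

```python
import logging

ydq_logger = logging.getLogger('ydata_quality')  # logger name assumed
ydq_logger.setLevel(logging.DEBUG)               # surface every message ...
ydq_logger.addHandler(logging.FileHandler('ydata_quality.log'))  # ... and also persist it to a file
```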
12 changes: 7 additions & 5 deletions src/ydata_quality/data_expectations/engine.py
@@ -8,15 +8,18 @@

from ydata_quality.core import QualityEngine, QualityWarning
from ydata_quality.utils.auxiliary import test_load_json_path
from ydata_quality.utils.logger import get_logger, NAME


class DataExpectationsReporter(QualityEngine):
"""Main class to run data expectation validation analysis.
Supports standard Great Expectations json reports from expectation suite validation runs.
"""

def __init__(self):
return # Override the base class init method
def __init__(self, severity: Optional[str]= None): # Overrides base class init
"severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL]"
self._warnings = [] # reset the warnings to avoid duplicates
self._logger = get_logger(NAME, level=severity)

@property
def tests(self):
@@ -175,18 +178,17 @@ def evaluate(self, results_json_path: str, df: pd.DataFrame = None, error_tol: i
rel_error_tol (float): Defines the maximum fraction of failed expectations, overrides error_tol.
minimum_coverage (float): Minimum expected fraction of DataFrame columns covered by the expectation suite.
"""
self._warnings = list() # reset the warnings to avoid duplicates
df = df if isinstance(df, pd.DataFrame) else None
results = {}
if df is not None:
try: # if anything fails
results['Coverage Fraction'] = self._coverage_fraction(
results_json_path, df, minimum_coverage=minimum_coverage)
except AssertionError as exc: # print a Warning and log the message
print("['DATA EXPECTATIONS'] Canceled Data Expectations engine execution due to dataset-expectation suite mismatch.")
self._logger.critical("Canceled Data Expectations engine execution due to dataset-expectation suite mismatch.")
return "[ERROR] Canceled computation. Original exception: "+f"{exc}"
else:
print("A valid DataFrame was not passed, skipping coverage fraction test.")
self._logger.error("A valid DataFrame was not passed, skipping coverage fraction test.")
results['Overall Assessment'] = self._overall_assessment(results_json_path, error_tol, rel_error_tol)
results['Expectation Level Assessment'] = self._expectation_level_assessment(results_json_path)
return results
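
A usage sketch for the reworked reporter; the dataset and results path below are hypothetical, and the keyword names are taken from the docstring above:

```python
import pandas as pd
from ydata_quality.data_expectations import DataExpectationsReporter

df = pd.read_csv('dataset.csv')  # hypothetical dataset backing the expectation suite
der = DataExpectationsReporter(severity='DEBUG')
results = der.evaluate('validation_results.json', df=df, minimum_coverage=0.75)
```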
26 changes: 15 additions & 11 deletions src/ydata_quality/data_relations/engine.py
@@ -9,13 +9,16 @@
from ydata_quality.core import QualityEngine, QualityWarning
from ydata_quality.utils.correlations import correlation_matrix, partial_correlation_matrix, correlation_plotter, vif_collinearity, chi2_collinearity
from ydata_quality.utils.auxiliary import infer_dtypes, standard_normalize
from ydata_quality.utils.logger import get_logger, NAME

class DataRelationsDetector(QualityEngine):
"""Main class to run data relations analysis.
"""

def __init__(self):
return # Override the base class init method
def __init__(self, severity: Optional[str] = None): # Overrides base class init
"severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL]"
self._warnings = [] # reset the warnings to avoid duplicates
self._logger = get_logger(NAME, level=severity)

@property
def tests(self):
Expand All @@ -29,12 +32,16 @@ def dtypes(self):
def dtypes(self, df_dtypes: Tuple[pd.DataFrame, dict]):
df, dtypes = df_dtypes
if not isinstance(dtypes, dict):
raise ValueError("Property 'dtypes' should be a dictionary.")
assert all(col in df.columns for col in dtypes), "All dtypes keys \
must be columns in the dataset."
self._logger.warning("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.")
dtypes = {}
cols_not_in_df = [col for col in dtypes if col not in df.columns]
if len(cols_not_in_df) > 0:
self._logger.warning("Passed dtypes keys %s are not columns of the provided dataset.", cols_not_in_df)
supported_dtypes = ['numerical', 'categorical']
assert all(dtype in supported_dtypes for dtype in dtypes.values()), "Assigned dtypes\
must be in the supported broad dtype list: {}.".format(supported_dtypes)
wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in supported_dtypes]
if len(wrong_dtypes)>0:
self._logger.warning("Columns %s of dtypes where not defined with a supported dtype and will be inferred.", wrong_dtypes)
dtypes = {key:val for key, val in dtypes.items() if key not in cols_not_in_df+wrong_dtypes}
df_col_set = set(df.columns)
dtypes_col_set = set(dtypes.keys())
missing_cols = df_col_set.difference(dtypes_col_set)
@@ -60,9 +67,6 @@ def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=N
plot (bool): Pass True to produce all available graphical outputs, False to suppress all graphical output.
"""
assert label in df.columns or not label, "The provided label name does not exist as a column in the dataset"
self._warnings = [] # reset the warnings to avoid duplicates
if not dtypes:
dtypes = {}
self.dtypes = (df, dtypes) # Consider refactoring QualityEngine dtypes (df as argument of setter)
df = standard_normalize(df, dtypes)
results = {}
@@ -77,7 +81,7 @@ def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=N
results['Confounders'] = self._confounder_detection(corr_mat, p_corr_mat, corr_th)
results['Colliders'] = self._collider_detection(corr_mat, p_corr_mat, corr_th)
else:
print('[DATA RELATIONS] The partial correlation matrix is not computable for this dataset. Skipping potential confounder and collider detection tests.')
self._logger.warning('The partial correlation matrix is not computable for this dataset. Skipping potential confounder and collider detection tests.')
if label:
results['Feature Importance'] = self._feature_importance(corr_mat, p_corr_mat, label, corr_th)
results['High Collinearity'] = self._high_collinearity_detection(df, self.dtypes, label, vif_th, p_th=p_th)
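
And the corresponding sketch for the data relations engine; the dataset, column names, and the plot keyword are assumptions based on the docstring above:

```python
import pandas as pd
from ydata_quality.data_relations import DataRelationsDetector

df = pd.read_csv('dataset.csv')  # hypothetical dataset
drd = DataRelationsDetector(severity='WARNING')
results = drd.evaluate(df, dtypes={'age': 'numerical'}, label='income', plot=False)
```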