feat: add loggers and change prints to logs #27

Merged · 11 commits · Sep 22, 2021
3 changes: 1 addition & 2 deletions src/ydata_quality/bias_fairness/engine.py
@@ -93,8 +93,7 @@ def performance_discrimination(self):
"""
# TODO: support error rate parity metrics (e.g. false positive rate, positive rate)
if self.label is None:
print('[BIAS&FAIRNESS] Argument "label" must be defined to calculate performance discrimination metric. Skipping test.')
pass
self._logger.warning('Argument "label" must be defined to calculate performance discrimination metric. Skipping test.')

res = {}
for feat in self.sensitive_features:
17 changes: 10 additions & 7 deletions src/ydata_quality/core/data_quality.py
@@ -15,6 +15,7 @@
from ydata_quality.data_expectations import DataExpectationsReporter
from ydata_quality.bias_fairness import BiasFairness
from ydata_quality.data_relations import DataRelationsDetector
from ydata_quality.utils.logger import *

class DataQuality:
"DataQuality contains the multiple data quality engines."
@@ -55,7 +56,7 @@ def __init__(self,
label (str, optional): [MISSINGS, LABELLING, DRIFT ANALYSIS] target feature to be predicted.
If not specified, LABELLING is skipped.
random_state (int, optional): Integer seed for random reproducibility. Default is None.
Set to None for fully random behaviour, no reproducibility.
Set to None for fully random behavior, no reproducibility.
entities: [DUPLICATES] entities relevant for duplicate analysis.
is_close: [DUPLICATES] Pass True to use numpy.isclose instead of pandas.equals in column comparison.
ed_extensions: [ERRONEOUS DATA] A list of user provided erroneous data values to append to defaults.
@@ -75,10 +76,12 @@ def __init__(self,
#TODO: Refactor legacy engines (property based) and logic in this class to new base (lean objects)
self.df = df
self._warnings = list()
self._logger = create_logger(NAME, STREAM, LOG_LEVEL)
self._random_state = random_state

self._engines_legacy = { # Default list of engines
'duplicates': DuplicateChecker(df=df, entities=entities, is_close=is_close),
'missings': MissingsProfiler(df=df, target=label, random_state=self.random_state),
'missings': MissingsProfiler(df=df, label=label, random_state=self.random_state),
'erroneous-data': ErroneousDataIdentifier(df=df, ed_extensions=ed_extensions),
'drift': DriftAnalyser(ref=df, sample=sample, label=label, model=model, random_state=self.random_state)
}
@@ -96,16 +99,16 @@ def __init__(self,
if label is not None:
self._engines_legacy['labelling'] = LabelInspector(df=df, label=label, random_state=self.random_state)
else:
print('Label is not defined. Skipping LABELLING engine.')
self._logger.warning('Label is not defined. Skipping LABELLING engine.')
if len(sensitive_features)>0:
self._engines_legacy['bias&fairness'] = BiasFairness(df=df, sensitive_features=sensitive_features,
label=label, random_state=self.random_state)
else:
print('Sensitive features not defined. Skipping BIAS & FAIRNESS engine.')
self._logger.warning('Sensitive features not defined. Skipping BIAS & FAIRNESS engine.')
if results_json_path is not None:
self._engines_new['expectations'] = DataExpectationsReporter()
else:
print('The path to a Great Expectations results json is not defined. Skipping EXPECTATIONS engine.')
self._logger.warning('The path to a Great Expectations results json is not defined. Skipping EXPECTATIONS engine.')


def __clean_warnings(self):
@@ -140,7 +143,7 @@ def random_state(self, new_state):
if new_state==None or (isinstance(new_state, int) and new_state>=0):
self._random_state = new_state
else:
print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None (no reproducibility).')
self._logger.warning('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None (no reproducibility).')
self._random_state = None

def __store_warnings(self):
@@ -158,7 +161,7 @@ def report(self):
self.__store_warnings() # fetch all warnings from the engines
self.__clean_warnings()
if not self._warnings:
print('No warnings found.')
self._logger.info('No warnings found.')
else:
prio_counts = Counter([warn.priority.value for warn in self._warnings])
print('Warnings count by priority:')
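Note on the new wiring: `create_logger(NAME, STREAM, LOG_LEVEL)` and its constants come from `ydata_quality.utils.logger`, a module not shown in this diff, so its exact API is not visible here. Below is a minimal sketch of what such a helper could look like using only the standard library; the names, defaults, and format string are assumptions about the intent, not code from the PR.

```python
# Hypothetical sketch of src/ydata_quality/utils/logger.py -- not taken from this PR.
import logging
import sys

NAME = "ydata_quality"       # assumed package-wide logger name
STREAM = sys.stdout          # assumed default output stream
LOG_LEVEL = logging.INFO     # assumed default verbosity


def create_logger(name: str = NAME, stream=STREAM, level: int = LOG_LEVEL) -> logging.Logger:
    """Return a named logger with a single stream handler attached."""
    logger = logging.getLogger(name)
    logger.setLevel(level)
    if not logger.handlers:  # avoid stacking handlers when several engines call this
        handler = logging.StreamHandler(stream)
        handler.setFormatter(logging.Formatter("[%(name)s] %(levelname)s: %(message)s"))
        logger.addHandler(handler)
    return logger
```

Under that assumption, every engine calling `create_logger(NAME, STREAM, LOG_LEVEL)` shares one named logger, so messages such as the "Skipping LABELLING engine" warning above get a consistent format and destination.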
26 changes: 16 additions & 10 deletions src/ydata_quality/core/engine.py
@@ -11,6 +11,7 @@
from ydata_quality.core.warnings import Priority, QualityWarning
from ydata_quality.utils.auxiliary import infer_df_type, infer_dtypes
from ydata_quality.utils.enum import DataFrameType
from ydata_quality.utils.logger import *


class QualityEngine(ABC):
@@ -20,6 +21,7 @@ def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label:
self._df = df
self._df_type = None
self._warnings = list()
self._logger = create_logger(NAME, STREAM, LOG_LEVEL)
self._tests = []
self._label = label
self._dtypes = dtypes
@@ -37,9 +39,8 @@ def label(self):

@label.setter
def label(self, label: str):
if not isinstance(label, str):
raise ValueError("Property 'label' should be a string.")
assert label in self.df.columns, "Given label should exist as a DataFrame column."
assert isinstance(label, str), "Property 'label' should be a string."
assert label in self.df.columns, "Provided label %s does not exist as a DataFrame column." % label
self._label = label

@property
@@ -52,11 +53,16 @@ def dtypes(self):
@dtypes.setter
def dtypes(self, dtypes: dict):
if not isinstance(dtypes, dict):
raise ValueError("Property 'dtypes' should be a dictionary.")
assert all(col in self.df.columns for col in dtypes), "All dtypes keys must be columns in the dataset."
self._logger.warning("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.")
dtypes = {}
cols_not_in_df = [col for col in dtypes if col not in self.df.columns]
if len(cols_not_in_df) > 0:
self._logger.warning("Passed dtypes keys %s are not columns of the provided dataset.", cols_not_in_df)
supported_dtypes = ['numerical', 'categorical']
assert all(dtype in supported_dtypes for dtype in dtypes.values()), "Assigned dtypes must be in the supported \
broad dtype list: {}.".format(supported_dtypes)
wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in supported_dtypes]
if len(wrong_dtypes) > 0:
self._logger.warning("Columns %s of dtypes were not defined with a supported dtype and will be inferred.", wrong_dtypes)
dtypes = {key:val for key, val in dtypes.items() if key not in cols_not_in_df+wrong_dtypes}
df_col_set = set(self.df.columns)
dtypes_col_set = set(dtypes.keys())
missing_cols = df_col_set.difference(dtypes_col_set)
@@ -85,7 +91,7 @@ def random_state(self, new_state):
self._random_state = new_state
random.seed(self.random_state)
except:
print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None.')
self._logger.warning('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None.')
self._random_state = None

def __clean_warnings(self):
@@ -116,7 +122,7 @@ def report(self):
"Prints a report containing all the warnings detected during the data quality analysis."
self.__clean_warnings()
if not self._warnings:
print('No warnings found.')
self._logger.info('No warnings found.')
else:
prio_counts = Counter([warn.priority.value for warn in self._warnings])
print('Warnings count by priority:')
@@ -133,6 +139,6 @@ def evaluate(self):
try: # if anything fails
results[test] = getattr(self, test)()
except Exception as exc: # print a Warning and log the message
print(f'WARNING: Skipping test {test} due to failure during computation.')
self._logger.warning('Skipping test due to failure during computation. See results folder of this test for further details.')
results[test] = "[ERROR] Test failed to compute. Original exception: "+f"{exc}"
return results
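The reworked `dtypes` setter above swaps hard assertions for warnings and drops invalid entries before inference takes over. Here is a self-contained sketch of that filtering step, mirroring the hunk; the helper name is hypothetical and not part of the PR.

```python
import logging
from typing import Optional

import pandas as pd

SUPPORTED_DTYPES = ("numerical", "categorical")


def sanitize_dtypes(df: pd.DataFrame, dtypes: Optional[dict], logger: logging.Logger) -> dict:
    """Drop dtype entries that cannot be honored, warning instead of raising."""
    if not isinstance(dtypes, dict):
        logger.warning("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.")
        dtypes = {}
    # Keys that are not columns of the provided DataFrame
    cols_not_in_df = [col for col in dtypes if col not in df.columns]
    if cols_not_in_df:
        logger.warning("Passed dtypes keys %s are not columns of the provided dataset.", cols_not_in_df)
    # Columns whose assigned dtype is outside the supported broad dtypes
    wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in SUPPORTED_DTYPES]
    if wrong_dtypes:
        logger.warning("Columns %s were not given a supported dtype and will be inferred.", wrong_dtypes)
    return {col: dtype for col, dtype in dtypes.items() if col not in cols_not_in_df + wrong_dtypes}
```

For example, passing `{'age': 'numerical', 'ghost': 'categorical', 'name': 'text'}` against a DataFrame without a `ghost` column would keep only `{'age': 'numerical'}` and log two warnings, which is the behaviour the new setter appears to aim for.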
4 changes: 2 additions & 2 deletions src/ydata_quality/data_expectations/engine.py
@@ -183,10 +183,10 @@ def evaluate(self, results_json_path: str, df: pd.DataFrame = None, error_tol: i
results['Coverage Fraction'] = self._coverage_fraction(
results_json_path, df, minimum_coverage=minimum_coverage)
except AssertionError as exc: # print a Warning and log the message
print("['DATA EXPECTATIONS'] Canceled Data Expectations engine execution due to dataset-expectation suite mismatch.")
self._logger.critical("Canceled Data Expectations engine execution due to dataset-expectation suite mismatch.")
return "[ERROR] Canceled computation. Original exception: "+f"{exc}"
else:
print("A valid DataFrame was not passed, skipping coverage fraction test.")
self._logger.error("A valid DataFrame was not passed, skipping coverage fraction test.")
results['Overall Assessment'] = self._overall_assessment(results_json_path, error_tol, rel_error_tol)
results['Expectation Level Assessment'] = self._expectation_level_assessment(results_json_path)
return results
16 changes: 10 additions & 6 deletions src/ydata_quality/data_relations/engine.py
@@ -29,12 +29,16 @@ def dtypes(self):
def dtypes(self, df_dtypes: Tuple[pd.DataFrame, dict]):
df, dtypes = df_dtypes
if not isinstance(dtypes, dict):
raise ValueError("Property 'dtypes' should be a dictionary.")
assert all(col in df.columns for col in dtypes), "All dtypes keys \
must be columns in the dataset."
self._logger.warning("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.")
dtypes = {}
cols_not_in_df = [col for col in dtypes if col not in df.columns]
if len(cols_not_in_df) > 0:
self._logger.warning("Passed dtypes keys %s are not columns of the provided dataset.", cols_not_in_df)
supported_dtypes = ['numerical', 'categorical']
assert all(dtype in supported_dtypes for dtype in dtypes.values()), "Assigned dtypes\
must be in the supported broad dtype list: {}.".format(supported_dtypes)
wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in supported_dtypes]
if len(wrong_dtypes)>0:
self._logger.warning("Columns %s of dtypes where not defined with a supported dtype and will be inferred.", wrong_dtypes)
dtypes = {key:val for key, val in dtypes.items() if key not in cols_not_in_df+wrong_dtypes}
df_col_set = set(df.columns)
dtypes_col_set = set(dtypes.keys())
missing_cols = df_col_set.difference(dtypes_col_set)
@@ -77,7 +81,7 @@ def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=N
results['Confounders'] = self._confounder_detection(corr_mat, p_corr_mat, corr_th)
results['Colliders'] = self._collider_detection(corr_mat, p_corr_mat, corr_th)
else:
print('[DATA RELATIONS] The partial correlation matrix is not computable for this dataset. Skipping potential confounder and collider detection tests.')
self._logger.warning('The partial correlation matrix is not computable for this dataset. Skipping potential confounder and collider detection tests.')
if label:
results['Feature Importance'] = self._feature_importance(corr_mat, p_corr_mat, label, corr_th)
results['High Collinearity'] = self._high_collinearity_detection(df, self.dtypes, label, vif_th, p_th=p_th)
8 changes: 4 additions & 4 deletions src/ydata_quality/drift/engine.py
@@ -223,7 +223,7 @@ def ref_label_drift(self, p_thresh: float= 0.05):
Args:
p_thresh (float): The p_threshold used for the test."""
if self.label is None:
print("[REFERENCE LABEL DRIFT] No label was provided. Test skipped.")
self._logger.warning("No label was provided. Test skipped.")
return
labels = self._remaining_data[self.label].copy()
holdout = self._holdout[self.label]
@@ -280,7 +280,7 @@ def sample_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame:
description=f"""There were {n_invalid_tests} invalid tests found. This is likely due to a small test sample size. The data summary should be analyzed before considering the test conclusive."""
))
else:
print("[SAMPLE COVARIATE DRIFT] Covariate drift was not detected in the test sample.")
self._logger.info("Covariate drift was not detected in the test sample.")
return test_summary

def sample_label_drift(self, p_thresh: float= 0.05) -> pd.Series:
@@ -312,7 +312,7 @@ def sample_label_drift(self, p_thresh: float= 0.05) -> pd.Series:
description="The test was invalid. This is likely due to a small test sample size."
))
else:
print("[SAMPLE LABEL DRIFT] Label drift was not detected in the test sample.")
self._logger.info("Label drift was not detected in the test sample.")
return test_summary

def sample_concept_drift(self, p_thresh: float= 0.05) -> pd.Series:
@@ -349,5 +349,5 @@ def sample_concept_drift(self, p_thresh: float= 0.05) -> pd.Series:
description="The test was invalid. This is likely due to a small test sample size."
))
else:
print("[CONCEPT DRIFT] Concept drift was not detected between the reference and the test samples.")
self._logger.info("Concept drift was not detected between the reference and the test samples.")
return test_summary
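The drift hunks above follow a consistent level convention: skipped or invalid tests log at warning, clean "no drift detected" outcomes at info. Assuming the engines use a standard-library logger named after the package (the name is an assumption, not confirmed by this diff), a caller could silence the informational messages like so:

```python
import logging

# Assumed logger name; after this call only WARNING and above
# (skipped tests, invalid inputs) are emitted.
logging.getLogger("ydata_quality").setLevel(logging.WARNING)
```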
10 changes: 7 additions & 3 deletions src/ydata_quality/duplicates/engine.py
@@ -14,6 +14,10 @@ class DuplicateChecker(QualityEngine):
"Engine for running analyis on duplicate records."

def __init__(self, df: pd.DataFrame, entities: List[Union[str, List[str]]] = [], is_close: bool=False):
"""
df (pd.DataFrame): reference DataFrame used to run the DataQuality analysis.
entities (List[Union[str, List[str]]]): entities relevant for duplicate analysis. Passing lists allows composed entities of multiple columns.
is_close (bool): Pass True to use numpy.isclose instead of pandas.equals in column comparison."""
super().__init__(df=df)
self._entities = entities
self._tests = ["exact_duplicates", "entity_duplicates", "duplicate_columns"]
@@ -58,7 +62,7 @@ def exact_duplicates(self):
description=f"Found {len(dups)} instances with exact duplicate feature values."
))
else:
print("[EXACT DUPLICATES] No exact duplicates were found.")
self._logger.info("No exact duplicates were found.")
dups = None
return dups

@@ -92,7 +96,7 @@ def entity_duplicates(self, entity: Optional[Union[str, List[str]]] = None):
ent_dups.setdefault(entity_key, {})[val] = dups[(dups[entity].values==val).all(axis=1)]
else: # if entity is not specified
if len(self.entities) == 0:
print("[ENTITY DUPLICATES] There are no entities defined to run the analysis. Skipping the test.")
self._logger.info("There are no entities defined to run the analysis. Skipping the test.")
return None
else:
for col in self.entities:
@@ -111,6 +115,6 @@ def duplicate_columns(self):
)
)
else:
print("[DUPLICATE COLUMNS] No duplicate columns were found.")
self._logger.info("No duplicate columns were found.")
dups = None
return dups
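Given the new constructor docstring, a short usage sketch of composed entities may help; the DataFrame, entity names, and import path below are illustrative assumptions rather than code from the repository.

```python
import pandas as pd
from ydata_quality.duplicates import DuplicateChecker  # assumed import path

# Toy data: 'id' is a single-column entity, ['city', 'store'] a composed entity.
df = pd.DataFrame({
    "id":    [1, 1, 2, 3],
    "city":  ["Porto", "Porto", "Lisbon", "Porto"],
    "store": ["A", "A", "B", "A"],
    "sales": [10.0, 10.0, 7.5, 12.0],
})

dc = DuplicateChecker(df=df, entities=["id", ["city", "store"]], is_close=False)
results = dc.evaluate()  # runs exact_duplicates, entity_duplicates and duplicate_columns
```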
8 changes: 3 additions & 5 deletions src/ydata_quality/erroneous_data/engine.py
@@ -83,7 +83,7 @@ def flatlines(self, th: int=5, skip: list=[]):
skip: List of columns that will not be target of search for flatlines.
Pass '__index' inside skip list to skip looking for flatlines at the index."""
if self.df_type == DataFrameType.TABULAR:
print('[FLATLINES] The provided DataFrame is not a valid Timeseries type, skipping this test.')
self._logger.info('The provided DataFrame is not a valid Timeseries type, skipping this test.')
return None
flatlines = {}
for column in self.df.columns: # Compile flatline index
@@ -101,7 +101,7 @@ def predefined_erroneous_data(self, skip: list=[], short: bool = True):
))
return flatlines
else:
print("[FLATLINES] No flatline events with a minimum length of {} were found.".format(th))
self._logger.info("No flatline events with a minimum length of %f were found.", th)

def predefined_erroneous_data(self, skip: list=[], short: bool = True):
"""Runs a check against a list of predefined erroneous data values.
@@ -127,9 +127,7 @@ def predefined_erroneous_data(self, skip: list=[], short: bool = True):
eds.drop(no_ed_cols, axis=1, inplace=True)
eds.drop(no_ed_rows, inplace=True)
if eds.empty:
print("[PREDEFINED ERRONEOUS DATA] No predefined ED values from the set {} were found in the dataset.".format(
self.err_data
))
self._logger.info("No predefined ED values from the set %s were found in the dataset.", self.err_data)
else:
total_eds = eds.sum().sum()
self.store_warning(
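A side note on message style: the new calls above pass arguments separately (e.g. `self._logger.info("... %s ...", self.err_data)`) instead of pre-building the string. A generic comparison of the two forms, unrelated to the library's own code:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")

err_data = ["?", "UNK", "999999"]

# Eager formatting: the f-string is built even when INFO is filtered out.
logger.info(f"No predefined ED values from the set {err_data} were found in the dataset.")

# Lazy %-formatting: interpolation happens only if the record is actually emitted,
# and handlers still receive the raw template plus its arguments.
logger.info("No predefined ED values from the set %s were found in the dataset.", err_data)
```

Both lines print the same text at INFO level; the second defers string interpolation and keeps the message template greppable.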