From b147602f883ed8025f6787f84529328453ae0714 Mon Sep 17 00:00:00 2001 From: Francisco Santos Date: Tue, 21 Sep 2021 19:18:04 +0100 Subject: [PATCH 01/10] add logger --- src/ydata_quality/utils/logger.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 src/ydata_quality/utils/logger.py diff --git a/src/ydata_quality/utils/logger.py b/src/ydata_quality/utils/logger.py new file mode 100644 index 00000000..24a2474e --- /dev/null +++ b/src/ydata_quality/utils/logger.py @@ -0,0 +1,25 @@ +import logging +from typing import TextIO +import sys +import os + +# Default vars for the logger +NAME = os.getenv('DQ_LOGGER_NAME', 'DQ_Logger') +STREAM = sys.stdout +LOG_LEVEL = os.getenv('DQ_LOG_LEVEL', logging.WARNING) + +def create_logger(name, stream: TextIO = sys.stdout, level=logging.INFO): + handler = logging.StreamHandler(stream) + handler.setFormatter( + logging.Formatter( + "%(levelname)s | %(message)s" + ) + ) + + logger = logging.getLogger(name) + logger.setLevel(level) + if len(logger.handlers)==0: + logger.addHandler(handler) + logger.propagate = False + + return logger From 2a2296eb24f486734d997e9f6504e20e1202def9 Mon Sep 17 00:00:00 2001 From: Francisco Santos Date: Tue, 21 Sep 2021 19:19:18 +0100 Subject: [PATCH 02/10] change target to label in modelling --- src/ydata_quality/utils/modelling.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/ydata_quality/utils/modelling.py b/src/ydata_quality/utils/modelling.py index ae8eb71c..b1d62810 100644 --- a/src/ydata_quality/utils/modelling.py +++ b/src/ydata_quality/utils/modelling.py @@ -21,6 +21,7 @@ from sklearn.utils._testing import ignore_warnings from ydata_quality.utils.enum import PredictionTask +from ydata_quality.utils.auxiliary import infer_dtypes BASELINE_CLASSIFIER = Pipeline([ ('imputer', SimpleImputer()), @@ -51,17 +52,17 @@ def get_prediction_task(df: pd.DataFrame, label: str): return 'regression' 
@ignore_warnings(category=ConvergenceWarning) -def baseline_predictions(df: pd.DataFrame, target: str, task='classification'): +def baseline_predictions(df: pd.DataFrame, label: str, task='classification'): "Train a baseline model and predict for a test set" # 0. Infer the prediction task - task = get_prediction_task(df=df, label=target) + task = get_prediction_task(df=df, label=label) # 1. Define the baseline model model = BASELINE_CLASSIFIER if task == 'classification' else BASELINE_REGRESSION # 2. Train overall model - X, y = df.drop(target, axis=1), label_binarize(df[target], classes=list(set(df[target]))) + X, y = df.drop(label, axis=1), label_binarize(df[label], classes=list(set(df[label]))) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) model.fit(X_train.select_dtypes('number'), y_train) @@ -75,26 +76,26 @@ def baseline_predictions(df: pd.DataFrame, target: str, task='classification'): return y_pred, X_test, y_test @ignore_warnings(category=DataConversionWarning) -def baseline_performance(df: pd.DataFrame, target: str, +def baseline_performance(df: pd.DataFrame, label: str, task: PredictionTask = PredictionTask.CLASSIFICATION, adjusted_metric: bool = False): """Train a baseline model, predict for a test set and return the performance. Args: - df (pd.DataFrame): original dataset - - target (str): name of target feature column + - label (str): name of target feature column - task (PredictionTask): classification, regression - adjusted_metric (bool): if True, return metric as percentage of max achievable performance """ # 0. Infer the prediction task - task = get_prediction_task(df=df, label=target) + task = get_prediction_task(df=df, label=label) # 1. Define the baseline performance metric metric = roc_auc_score if task == 'classification' else mean_squared_error # 2. 
Get the baseline predictions - y_pred, _, y_test = baseline_predictions(df=df, target=target, task=task) + y_pred, _, y_test = baseline_predictions(df=df, label=label, task=task) # 3. Get the performance if adjusted_metric: @@ -119,17 +120,17 @@ def adjusted_performance(y_true, y_pred, task: PredictionTask, metric: callable) return (real_perf - base_perf) / (best_perf - base_perf) @ignore_warnings(category=DataConversionWarning) -def performance_per_feature_values(df: pd.DataFrame, feature: str, target: str, task='classification'): +def performance_per_feature_values(df: pd.DataFrame, feature: str, label: str, task='classification'): """Performance achieved per each value of a groupby feature.""" # 0. Infer the prediction task - task = get_prediction_task(df=df, label=target) + task = get_prediction_task(df=df, label=label) # 1. Define the baseline performance metric metric = roc_auc_score if task == 'classification' else mean_squared_error # 2. Get the baseline predictions - y_pred, X_test, y_test = baseline_predictions(df=df, target=target, task=task) + y_pred, X_test, y_test = baseline_predictions(df=df, label=label, task=task) # 3. Get the performances per feature value uniques = set(X_test[feature]) @@ -144,17 +145,17 @@ def performance_per_feature_values(df: pd.DataFrame, feature: str, target: str, return results -def performance_per_missing_value(df: pd.DataFrame, feature: str, target: str, task='classification'): +def performance_per_missing_value(df: pd.DataFrame, feature: str, label: str, task='classification'): """Performance difference between valued and missing values in feature.""" # 0. Infer the prediction task - task = get_prediction_task(df=df, label=target) + task = get_prediction_task(df=df, label=label) # 1. Define the baseline performance metric metric = roc_auc_score if task == 'classification' else mean_squared_error # 2. 
Get the baseline predictions - y_pred, X_test, y_test = baseline_predictions(df=df, target=target, task=task) + y_pred, X_test, y_test = baseline_predictions(df=df, label=label, task=task) # 3. Get the performance per valued vs missing feature missing_mask = X_test[feature].isna() From e84482f41fbf053cb2001b5d3a8d66a8bd283174 Mon Sep 17 00:00:00 2001 From: Francisco Santos Date: Tue, 21 Sep 2021 19:20:01 +0100 Subject: [PATCH 03/10] instantiating loggers in engine --- src/ydata_quality/core/data_quality.py | 17 ++++++++++------- src/ydata_quality/core/engine.py | 26 ++++++++++++++++---------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/src/ydata_quality/core/data_quality.py b/src/ydata_quality/core/data_quality.py index 463194e4..609ef5d7 100644 --- a/src/ydata_quality/core/data_quality.py +++ b/src/ydata_quality/core/data_quality.py @@ -15,6 +15,7 @@ from ydata_quality.data_expectations import DataExpectationsReporter from ydata_quality.bias_fairness import BiasFairness from ydata_quality.data_relations import DataRelationsDetector +from ydata_quality.utils.logger import * class DataQuality: "DataQuality contains the multiple data quality engines." @@ -55,7 +56,7 @@ def __init__(self, label (str, optional): [MISSINGS, LABELLING, DRIFT ANALYSIS] target feature to be predicted. If not specified, LABELLING is skipped. random_state (int, optional): Integer seed for random reproducibility. Default is None. - Set to None for fully random behaviour, no reproducibility. + Set to None for fully random behavior, no reproducibility. entities: [DUPLICATES] entities relevant for duplicate analysis. is_close: [DUPLICATES] Pass True to use numpy.isclose instead of pandas.equals in column comparison. ed_extensions: [ERRONEOUS DATA] A list of user provided erroneous data values to append to defaults. 
@@ -75,10 +76,12 @@ def __init__(self, #TODO: Refactor legacy engines (property based) and logic in this class to new base (lean objects) self.df = df self._warnings = list() + self._logger = create_logger(NAME, STREAM, LOG_LEVEL) self._random_state = random_state + self._engines_legacy = { # Default list of engines 'duplicates': DuplicateChecker(df=df, entities=entities, is_close=is_close), - 'missings': MissingsProfiler(df=df, target=label, random_state=self.random_state), + 'missings': MissingsProfiler(df=df, label=label, random_state=self.random_state), 'erroneous-data': ErroneousDataIdentifier(df=df, ed_extensions=ed_extensions), 'drift': DriftAnalyser(ref=df, sample=sample, label=label, model=model, random_state=self.random_state) } @@ -96,16 +99,16 @@ def __init__(self, if label is not None: self._engines_legacy['labelling'] = LabelInspector(df=df, label=label, random_state=self.random_state) else: - print('Label is not defined. Skipping LABELLING engine.') + self._logger.warning('Label is not defined. Skipping LABELLING engine.') if len(sensitive_features)>0: self._engines_legacy['bias&fairness'] = BiasFairness(df=df, sensitive_features=sensitive_features, label=label, random_state=self.random_state) else: - print('Sensitive features not defined. Skipping BIAS & FAIRNESS engine.') + self._logger.warning('Sensitive features not defined. Skipping BIAS & FAIRNESS engine.') if results_json_path is not None: self._engines_new['expectations'] = DataExpectationsReporter() else: - print('The path to a Great Expectations results json is not defined. Skipping EXPECTATIONS engine.') + self._logger.warning('The path to a Great Expectations results json is not defined. Skipping EXPECTATIONS engine.') def __clean_warnings(self): @@ -140,7 +143,7 @@ def random_state(self, new_state): if new_state==None or (isinstance(new_state, int) and new_state>=0): self._random_state = new_state else: - print('An invalid random state was passed. 
Acceptable values are integers >= 0 or None. Setting to None (no reproducibility).') + self._logger.warning('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None (no reproducibility).') self._random_state = None def __store_warnings(self): @@ -158,7 +161,7 @@ def report(self): self.__store_warnings() # fetch all warnings from the engines self.__clean_warnings() if not self._warnings: - print('No warnings found.') + self._logger.info('No warnings found.') else: prio_counts = Counter([warn.priority.value for warn in self._warnings]) print('Warnings count by priority:') diff --git a/src/ydata_quality/core/engine.py b/src/ydata_quality/core/engine.py index f4da37b4..539c82a1 100644 --- a/src/ydata_quality/core/engine.py +++ b/src/ydata_quality/core/engine.py @@ -11,6 +11,7 @@ from ydata_quality.core.warnings import Priority, QualityWarning from ydata_quality.utils.auxiliary import infer_df_type, infer_dtypes from ydata_quality.utils.enum import DataFrameType +from ydata_quality.utils.logger import * class QualityEngine(ABC): @@ -20,6 +21,7 @@ def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label: self._df = df self._df_type = None self._warnings = list() + self._logger = create_logger(NAME, STREAM, LOG_LEVEL) self._tests = [] self._label = label self._dtypes = dtypes @@ -37,9 +39,8 @@ def label(self): @label.setter def label(self, label: str): - if not isinstance(label, str): - raise ValueError("Property 'label' should be a string.") - assert label in self.df.columns, "Given label should exist as a DataFrame column." + assert isinstance(label, str), "Property 'label' should be a string." + assert label in self.df.columns, "Provided label %s does not exist as a DataFrame column." 
% label self._label = label @property @@ -52,11 +53,16 @@ def dtypes(self): @dtypes.setter def dtypes(self, dtypes: dict): if not isinstance(dtypes, dict): - raise ValueError("Property 'dtypes' should be a dictionary.") - assert all(col in self.df.columns for col in dtypes), "All dtypes keys must be columns in the dataset." + self._logger.warning("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.") + dtypes = {} + cols_not_in_df = [col for col in dtypes if col not in self.df.columns] + if len(cols_not_in_df) > 0: + self._logger.warning("Passed dtypes keys %s are not columns of the provided dataset.", cols_not_in_df) supported_dtypes = ['numerical', 'categorical'] - assert all(dtype in supported_dtypes for dtype in dtypes.values()), "Assigned dtypes must be in the supported \ -broad dtype list: {}.".format(supported_dtypes) + wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in supported_dtypes] + if len(wrong_dtypes)>0: + self._logger.warning("Columns %s of dtypes where not defined with a supported dtype and will be inferred.", wrong_dtypes) + dtypes = {key:val for key, val in dtypes.items() if key not in cols_not_in_df+wrong_dtypes} df_col_set = set(self.df.columns) dtypes_col_set = set(dtypes.keys()) missing_cols = df_col_set.difference(dtypes_col_set) @@ -85,7 +91,7 @@ def random_state(self, new_state): self._random_state = new_state random.seed(self.random_state) except: - print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None.') + self._logger.warning('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None.') self._random_state = None def __clean_warnings(self): @@ -116,7 +122,7 @@ def report(self): "Prints a report containing all the warnings detected during the data quality analysis." 
self.__clean_warnings() if not self._warnings: - print('No warnings found.') + self._logger.info('No warnings found.') else: prio_counts = Counter([warn.priority.value for warn in self._warnings]) print('Warnings count by priority:') @@ -133,6 +139,6 @@ def evaluate(self): try: # if anything fails results[test] = getattr(self, test)() except Exception as exc: # print a Warning and log the message - print(f'WARNING: Skipping test {test} due to failure during computation.') + self._logger.warning('Skipping test due to failure during computation. See results folder of this test for further details.') results[test] = "[ERROR] Test failed to compute. Original exception: "+f"{exc}" return results From ab018d4c9848563a88404b892919522ed1180da6 Mon Sep 17 00:00:00 2001 From: Francisco Santos Date: Tue, 21 Sep 2021 19:20:32 +0100 Subject: [PATCH 04/10] change prints to logs --- src/ydata_quality/bias_fairness/engine.py | 3 +- src/ydata_quality/data_expectations/engine.py | 4 +-- src/ydata_quality/data_relations/engine.py | 16 +++++---- src/ydata_quality/drift/engine.py | 8 ++--- src/ydata_quality/duplicates/engine.py | 10 ++++-- src/ydata_quality/erroneous_data/engine.py | 8 ++--- src/ydata_quality/labelling/engine.py | 12 +++---- src/ydata_quality/missings/engine.py | 34 +++++++------------ 8 files changed, 45 insertions(+), 50 deletions(-) diff --git a/src/ydata_quality/bias_fairness/engine.py b/src/ydata_quality/bias_fairness/engine.py index c9c2ee38..5c63ff09 100644 --- a/src/ydata_quality/bias_fairness/engine.py +++ b/src/ydata_quality/bias_fairness/engine.py @@ -93,8 +93,7 @@ def performance_discrimination(self): """ # TODO: support error rate parity metrics (e.g. false positive rate, positive rate) if self.label is None: - print('[BIAS&FAIRNESS] Argument "label" must be defined to calculate performance discrimination metric. Skipping test.') - pass + self._logger.warning('Argument "label" must be defined to calculate performance discrimination metric. 
Skipping test.') res = {} for feat in self.sensitive_features: diff --git a/src/ydata_quality/data_expectations/engine.py b/src/ydata_quality/data_expectations/engine.py index 07e20b15..82b0915a 100644 --- a/src/ydata_quality/data_expectations/engine.py +++ b/src/ydata_quality/data_expectations/engine.py @@ -183,10 +183,10 @@ def evaluate(self, results_json_path: str, df: pd.DataFrame = None, error_tol: i results['Coverage Fraction'] = self._coverage_fraction( results_json_path, df, minimum_coverage=minimum_coverage) except AssertionError as exc: # print a Warning and log the message - print("['DATA EXPECTATIONS'] Canceled Data Expectations engine execution due to dataset-expectation suite mismatch.") + self._logger.critical("Canceled Data Expectations engine execution due to dataset-expectation suite mismatch.") return "[ERROR] Canceled computation. Original exception: "+f"{exc}" else: - print("A valid DataFrame was not passed, skipping coverage fraction test.") + self._logger.error("A valid DataFrame was not passed, skipping coverage fraction test.") results['Overall Assessment'] = self._overall_assessment(results_json_path, error_tol, rel_error_tol) results['Expectation Level Assessment'] = self._expectation_level_assessment(results_json_path) return results diff --git a/src/ydata_quality/data_relations/engine.py b/src/ydata_quality/data_relations/engine.py index 196fa75c..ca648f38 100644 --- a/src/ydata_quality/data_relations/engine.py +++ b/src/ydata_quality/data_relations/engine.py @@ -29,12 +29,16 @@ def dtypes(self): def dtypes(self, df_dtypes: Tuple[pd.DataFrame, dict]): df, dtypes = df_dtypes if not isinstance(dtypes, dict): - raise ValueError("Property 'dtypes' should be a dictionary.") - assert all(col in df.columns for col in dtypes), "All dtypes keys \ - must be columns in the dataset." + self._logger.warning("Property 'dtypes' should be a dictionary. 
Defaulting to all column dtypes inference.") + dtypes = {} + cols_not_in_df = [col for col in dtypes if col not in df.columns] + if len(cols_not_in_df) > 0: + self._logger.warning("Passed dtypes keys %s are not columns of the provided dataset.", cols_not_in_df) supported_dtypes = ['numerical', 'categorical'] - assert all(dtype in supported_dtypes for dtype in dtypes.values()), "Assigned dtypes\ - must be in the supported broad dtype list: {}.".format(supported_dtypes) + wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in supported_dtypes] + if len(wrong_dtypes)>0: + self._logger.warning("Columns %s of dtypes where not defined with a supported dtype and will be inferred.", wrong_dtypes) + dtypes = {key:val for key, val in dtypes.items() if key not in cols_not_in_df+wrong_dtypes} df_col_set = set(df.columns) dtypes_col_set = set(dtypes.keys()) missing_cols = df_col_set.difference(dtypes_col_set) @@ -77,7 +81,7 @@ def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=N results['Confounders'] = self._confounder_detection(corr_mat, p_corr_mat, corr_th) results['Colliders'] = self._collider_detection(corr_mat, p_corr_mat, corr_th) else: - print('[DATA RELATIONS] The partial correlation matrix is not computable for this dataset. Skipping potential confounder and collider detection tests.') + self._logger.warning('The partial correlation matrix is not computable for this dataset. 
Skipping potential confounder and collider detection tests.') if label: results['Feature Importance'] = self._feature_importance(corr_mat, p_corr_mat, label, corr_th) results['High Collinearity'] = self._high_collinearity_detection(df, self.dtypes, label, vif_th, p_th=p_th) diff --git a/src/ydata_quality/drift/engine.py b/src/ydata_quality/drift/engine.py index d209ea4a..1edeb219 100644 --- a/src/ydata_quality/drift/engine.py +++ b/src/ydata_quality/drift/engine.py @@ -223,7 +223,7 @@ def ref_label_drift(self, p_thresh: float= 0.05): Args: p_thresh (float): The p_threshold used for the test.""" if self.label is None: - print("[REFERENCE LABEL DRIFT] No label was provided. Test skipped.") + self._logger.warning("No label was provided. Test skipped.") return labels = self._remaining_data[self.label].copy() holdout = self._holdout[self.label] @@ -280,7 +280,7 @@ def sample_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame: description=f"""There were {n_invalid_tests} invalid tests found. This is likely due to a small test sample size. The data summary should be analyzed before considering the test conclusive.""" )) else: - print("[SAMPLE COVARIATE DRIFT] Covariate drift was not detected in the test sample.") + self._logger.info("Covariate drift was not detected in the test sample.") return test_summary def sample_label_drift(self, p_thresh: float= 0.05) -> pd.Series: @@ -312,7 +312,7 @@ def sample_label_drift(self, p_thresh: float= 0.05) -> pd.Series: description="The test was invalid. This is likely due to a small test sample size." )) else: - print("[SAMPLE LABEL DRIFT] Label drift was not detected in the test sample.") + self._logger.info("Label drift was not detected in the test sample.") return test_summary def sample_concept_drift(self, p_thresh: float= 0.05) -> pd.Series: @@ -349,5 +349,5 @@ def sample_concept_drift(self, p_thresh: float= 0.05) -> pd.Series: description="The test was invalid. This is likely due to a small test sample size." 
)) else: - print("[CONCEPT DRIFT] Concept drift was not detected between the reference and the test samples.") + self._logger.info("Concept drift was not detected between the reference and the test samples.") return test_summary diff --git a/src/ydata_quality/duplicates/engine.py b/src/ydata_quality/duplicates/engine.py index e84ed002..ffb8616c 100644 --- a/src/ydata_quality/duplicates/engine.py +++ b/src/ydata_quality/duplicates/engine.py @@ -14,6 +14,10 @@ class DuplicateChecker(QualityEngine): "Engine for running analyis on duplicate records." def __init__(self, df: pd.DataFrame, entities: List[Union[str, List[str]]] = [], is_close: bool=False): + """ + df (pd.DataFrame): reference DataFrame used to run the DataQuality analysis. + entities (List[Union[str, List[str]]]): entities relevant for duplicate analysis. Passing lists allows composed entities of multiple columns. + is_close (bool): Pass True to use numpy.isclose instead of pandas.equals in column comparison.""" super().__init__(df=df) self._entities = entities self._tests = ["exact_duplicates", "entity_duplicates", "duplicate_columns"] @@ -58,7 +62,7 @@ def exact_duplicates(self): description=f"Found {len(dups)} instances with exact duplicate feature values." )) else: - print("[EXACT DUPLICATES] No exact duplicates were found.") + self._logger.info("No exact duplicates were found.") dups = None return dups @@ -92,7 +96,7 @@ def entity_duplicates(self, entity: Optional[Union[str, List[str]]] = None): ent_dups.setdefault(entity_key, {})[val] = dups[(dups[entity].values==val).all(axis=1)] else: # if entity is not specified if len(self.entities) == 0: - print("[ENTITY DUPLICATES] There are no entities defined to run the analysis. Skipping the test.") + self._logger.info("There are no entities defined to run the analysis. 
Skipping the test.") return None else: for col in self.entities: @@ -111,6 +115,6 @@ def duplicate_columns(self): ) ) else: - print("[DUPLICATE COLUMNS] No duplicate columns were found.") + self._logger.info("No duplicate columns were found.") dups = None return dups diff --git a/src/ydata_quality/erroneous_data/engine.py b/src/ydata_quality/erroneous_data/engine.py index c6d1f599..25b6b7ff 100644 --- a/src/ydata_quality/erroneous_data/engine.py +++ b/src/ydata_quality/erroneous_data/engine.py @@ -83,7 +83,7 @@ def flatlines(self, th: int=5, skip: list=[]): skip: List of columns that will not be target of search for flatlines. Pass '__index' inside skip list to skip looking for flatlines at the index.""" if self.df_type == DataFrameType.TABULAR: - print('[FLATLINES] The provided DataFrame is not a valid Timeseries type, skipping this test.') + self._logger.info('The provided DataFrame is not a valid Timeseries type, skipping this test.') return None flatlines = {} for column in self.df.columns: # Compile flatline index @@ -101,7 +101,7 @@ def flatlines(self, th: int=5, skip: list=[]): )) return flatlines else: - print("[FLATLINES] No flatline events with a minimum length of {} were found.".format(th)) + self._logger.info("No flatline events with a minimum length of %s were found.", th) def predefined_erroneous_data(self, skip: list=[], short: bool = True): """Runs a check against a list of predefined erroneous data values. 
@@ -127,9 +127,7 @@ def predefined_erroneous_data(self, skip: list=[], short: bool = True): eds.drop(no_ed_cols, axis=1, inplace=True) eds.drop(no_ed_rows, inplace=True) if eds.empty: - print("[PREDEFINED ERRONEOUS DATA] No predefined ED values from the set {} were found in the dataset.".format( - self.err_data - )) + self._logger.info("No predefined ED values from the set %s were found in the dataset.", self.err_data) else: total_eds = eds.sum().sum() self.store_warning( diff --git a/src/ydata_quality/labelling/engine.py b/src/ydata_quality/labelling/engine.py index ab671118..98585ab4 100644 --- a/src/ydata_quality/labelling/engine.py +++ b/src/ydata_quality/labelling/engine.py @@ -61,7 +61,7 @@ def missing_labels(self): description=f"Found {len(missing_labels)} instances with missing labels." )) else: - print("[MISSING LABELS] No missing labels were found.") + self._logger.info("No missing labels were found.") missing_labels = None return missing_labels @@ -112,7 +112,7 @@ def few_labels(self, count_th: Union[int, float] = 1): "Found {} labels with {} or less records.".format(len(few_labels), count_th) )) else: - print("[FEW LABELS] No labels with {} or less records were found.".format(count_th)) + self._logger.info("No labels with %d or less records were found.", count_th) few_labels = None return few_labels @@ -149,7 +149,7 @@ def unbalanced_classes(self, slack: float = 0.3): set(data['Over-represented'].keys()), fair_share+adj_slack) )) else: - print("[UNBALANCED CLASSES] No unbalanced classes were found.") + self._logger.info("No unbalanced classes were found.") return None return label_excess.index @@ -299,9 +299,9 @@ def test_normality(self, p_th=5e-3): test_result, transform, pstat = normality_test(vals, p_th=p_th) if test_result: if transform is None: - print("[TEST NORMALITY] The label values appears to be normally distributed.") + self._logger.info("The label values appears to be normally distributed.") else: - print("[TEST NORMALITY] The {} transform 
appears to be able to normalize the label values.".format(transform)) + self._logger.info("The %s transform appears to be able to normalize the label values.", transform) self.store_warning( QualityWarning( test='Test normality', category='Labels', priority=2, data=vals, @@ -310,7 +310,7 @@ def test_normality(self, p_th=5e-3): transform, pstat) )) else: - print("[TEST NORMALITY] It was not possible to normalize the label values. See the warning message for additional context.") + self._logger.warning("It was not possible to normalize the label values. See the data quality warning message for additional context.") self.store_warning( QualityWarning( test='Test normality', category='Labels', priority=1, data=vals, diff --git a/src/ydata_quality/missings/engine.py b/src/ydata_quality/missings/engine.py index 2cb69abc..992e0af0 100644 --- a/src/ydata_quality/missings/engine.py +++ b/src/ydata_quality/missings/engine.py @@ -15,27 +15,18 @@ class MissingsProfiler(QualityEngine): "Main class to run missing value analysis." - def __init__(self, df: pd.DataFrame, target: Optional[str] = None, random_state: Optional[int]=None): + def __init__(self, df: pd.DataFrame, label: Optional[str] = None, random_state: Optional[int]=None): """ Args: df (pd.DataFrame): reference DataFrame used to run the missing value analysis. - target (str, optional): target + label (str, optional): target feature to be predicted. + random_state (int, optional): Integer seed for random reproducibility. Default is None. + Set to None for fully random behavior, no reproducibility. 
""" - #TODO: Rename 'target' argument to 'label' standard of QualityEngine super().__init__(df=df, random_state=random_state) - self._target = target + self._label = label self._tests = ["nulls_higher_than", "high_missing_correlations", "predict_missings"] - @property - def target(self): - return self._target - - @target.setter - def target(self, target: str): - if target not in self.df.columns: - raise Exception(f'Provided target ({target}) must belong to the dataframe columns ({list(self.df.columns)}).') - self._target = target - def _get_null_cols(self, col: Optional[str] = None) -> List[str]: "Returns list of given column or all columns with null values in DataFrame if None." return list(self.df.columns[self.null_count(minimal=False)>0]) if col is None \ @@ -43,9 +34,9 @@ def _get_null_cols(self, col: Optional[str] = None) -> List[str]: else [col] def __get_prediction_type(self): - "Decide whether to use classification or regression setting, based on target." + "Decide whether to use classification or regression setting, based on label." # TODO: Improve prediction type guesstimate based on alternative heuristics (e.g. dtypes, value_counts) - if len(set(self.df[self.target])) == 2: # binary classification + if len(set(self.df[self.label])) == 2: # binary classification return 'classification' else: return 'regression' @@ -128,21 +119,20 @@ def performance_drop(self, col: Union[List[str], str, None] = None, normalize=Tr # Parse the columns for which to calculate the drop in performance on missings cols = self._get_null_cols(col) - # Guarantee that target is defined. Otherwise skip - if self.target is None: - print('Argument "target" must be defined to calculate performance_drop metric. Skipping test.') - pass + # Guarantee that label is defined. Otherwise skip + if self.label is None: + self._logger.warning('Argument "label" must be defined to calculate performance_drop metric. 
Skipping test.') # Guesstimate the prediction type prediction_type = self.__get_prediction_type() results = pd.DataFrame({ - c: performance_per_missing_value(df=self.df, feature=c, target=self.target, task=prediction_type) + c: performance_per_missing_value(df=self.df, feature=c, label=self.label, task=prediction_type) for c in cols }) # Normalize the results with a baseline performance. if normalize: - baseline = baseline_performance(df=self.df, target=self.target, task=prediction_type) + baseline = baseline_performance(df=self.df, label=self.label, task=prediction_type) results = results / baseline return results From a42b95ba25b7b32a7c20a9fea2ef832476d2fc3d Mon Sep 17 00:00:00 2001 From: Francisco Santos Date: Tue, 21 Sep 2021 20:05:50 +0100 Subject: [PATCH 05/10] Fix runtime issues in aux methods --- src/ydata_quality/core/engine.py | 1 - src/ydata_quality/data_expectations/engine.py | 7 +- src/ydata_quality/data_relations/engine.py | 9 +- src/ydata_quality/utils/auxiliary.py | 14 +-- tutorials/data_quality.ipynb | 61 +++++-------- tutorials/main.ipynb | 88 ++++++++++++++----- 6 files changed, 102 insertions(+), 78 deletions(-) diff --git a/src/ydata_quality/core/engine.py b/src/ydata_quality/core/engine.py index 539c82a1..b40060d9 100644 --- a/src/ydata_quality/core/engine.py +++ b/src/ydata_quality/core/engine.py @@ -10,7 +10,6 @@ from ydata_quality.core.warnings import Priority, QualityWarning from ydata_quality.utils.auxiliary import infer_df_type, infer_dtypes -from ydata_quality.utils.enum import DataFrameType from ydata_quality.utils.logger import * diff --git a/src/ydata_quality/data_expectations/engine.py b/src/ydata_quality/data_expectations/engine.py index 82b0915a..72a55431 100644 --- a/src/ydata_quality/data_expectations/engine.py +++ b/src/ydata_quality/data_expectations/engine.py @@ -8,6 +8,7 @@ from ydata_quality.core import QualityEngine, QualityWarning from ydata_quality.utils.auxiliary import test_load_json_path +from 
ydata_quality.utils.logger import * class DataExpectationsReporter(QualityEngine): @@ -15,8 +16,9 @@ class DataExpectationsReporter(QualityEngine): Supports standard Great Expectations json reports from expectation suite validation runs. """ - def __init__(self): - return # Override the base class init method + def __init__(self): # Overrides base class init + self._warnings = [] # reset the warnings to avoid duplicates + self._logger = create_logger(NAME, STREAM, LOG_LEVEL) @property def tests(self): @@ -175,7 +177,6 @@ def evaluate(self, results_json_path: str, df: pd.DataFrame = None, error_tol: i rel_error_tol (float): Defines the maximum fraction of failed expectations, overrides error_tol. minimum_coverage (float): Minimum expected fraction of DataFrame columns covered by the expectation suite. """ - self._warnings = list() # reset the warnings to avoid duplicates df = df if isinstance(df, pd.DataFrame) else None results = {} if df is not None: diff --git a/src/ydata_quality/data_relations/engine.py b/src/ydata_quality/data_relations/engine.py index ca648f38..823748ac 100644 --- a/src/ydata_quality/data_relations/engine.py +++ b/src/ydata_quality/data_relations/engine.py @@ -9,13 +9,15 @@ from ydata_quality.core import QualityEngine, QualityWarning from ydata_quality.utils.correlations import correlation_matrix, partial_correlation_matrix, correlation_plotter, vif_collinearity, chi2_collinearity from ydata_quality.utils.auxiliary import infer_dtypes, standard_normalize +from ydata_quality.utils.logger import * class DataRelationsDetector(QualityEngine): """Main class to run data relations analysis. 
""" - def __init__(self): - return # Override the base class init method + def __init__(self): # Overrides base class init + self._warnings = [] # reset the warnings to avoid duplicates + self._logger = create_logger(NAME, STREAM, LOG_LEVEL) @property def tests(self): @@ -64,9 +66,6 @@ def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=N plot (bool): Pass True to produce all available graphical outputs, False to suppress all graphical output. """ assert label in df.columns or not label, "The provided label name does not exist as a column in the dataset" - self._warnings = [] # reset the warnings to avoid duplicates - if not dtypes: - dtypes = {} self.dtypes = (df, dtypes) # Consider refactoring QualityEngine dtypes (df as argument of setter) df = standard_normalize(df, dtypes) results = {} diff --git a/src/ydata_quality/utils/auxiliary.py b/src/ydata_quality/utils/auxiliary.py index d00c9e78..a277754e 100644 --- a/src/ydata_quality/utils/auxiliary.py +++ b/src/ydata_quality/utils/auxiliary.py @@ -50,9 +50,10 @@ def min_max_normalize(df: pd.DataFrame, dtypes: dict) -> pd.DataFrame: Args: df (pd.DataFrame): DataFrame to be normalized dtypes (dict): Map of column names to variable types""" - numeric_features = [col for col in df.columns if dtypes[col]=='numerical'] - scaled_data = MinMaxScaler().fit_transform(df[numeric_features].values) - df[numeric_features] = scaled_data + numeric_features = [col for col in df.columns if dtypes.get(col)=='numerical'] + if numeric_features: + scaled_data = MinMaxScaler().fit_transform(df[numeric_features].values) + df[numeric_features] = scaled_data return df def standard_normalize(df: pd.DataFrame, dtypes: dict) -> pd.DataFrame: @@ -61,9 +62,10 @@ def standard_normalize(df: pd.DataFrame, dtypes: dict) -> pd.DataFrame: Args: df (pd.DataFrame): DataFrame to be normalized dtypes (dict): Map of column names to variable types""" - numeric_features = [col for col in df.columns if dtypes[col]=='numerical'] - 
scaled_data = StandardScaler().fit_transform(df[numeric_features].values) - df[numeric_features] = scaled_data + numeric_features = [col for col in df.columns if dtypes.get(col)=='numerical'] + if numeric_features: + scaled_data = StandardScaler().fit_transform(df[numeric_features].values) + df[numeric_features] = scaled_data return df def find_duplicate_columns(df: pd.DataFrame, is_close=False) -> dict: diff --git a/tutorials/data_quality.ipynb b/tutorials/data_quality.ipynb index 75fbc03f..471b7d82 100644 --- a/tutorials/data_quality.ipynb +++ b/tutorials/data_quality.ipynb @@ -24,7 +24,6 @@ "execution_count": 1, "source": [ "import pandas as pd\n", - "import numpy as np\n", "from ydata_quality import DataQuality" ], "outputs": [], @@ -97,10 +96,8 @@ "output_type": "stream", "name": "stdout", "text": [ - "[ENTITY DUPLICATES] There are no entities defined to run the analysis. Skipping the test.\n", - "WARNING: Skipping test predict_missings due to failure during computation.\n", - "[PREDEFINED ERRONEOUS DATA] No predefined ED values from the set {'', 'unknown', 'unk', 'a_custom_edv', '?', '(blank)', 'na', '!', 'n/a', 999999999} were found in the dataset.\n", - "WARNING: Skipping test ref_covariate_drift due to failure during computation.\n" + "WARNING | Skipping test due to failure during computation. See results folder of this test for further details.\n", + "WARNING | Skipping test due to failure during computation. See results folder of this test for further details.\n" ] }, { @@ -117,19 +114,9 @@ "output_type": "stream", "name": "stdout", "text": [ - "[MISSING LABELS] No missing labels were found.\n", - "[TEST NORMALITY] It was not possible to normalize the label values. 
See the warning message for additional context.\n", - "WARNING: Skipping test performance_discrimination due to failure during computation.\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/home/fsantos/GitRepos/ydata-quality/src/ydata_quality/utils/correlations.py:132: RuntimeWarning: invalid value encountered in sqrt\n", - " scaled_diag = np.diag(np.sqrt(1 / diag))\n", - "/home/fsantos/miniconda3/envs/DQ/lib/python3.8/site-packages/statsmodels/stats/outliers_influence.py:193: RuntimeWarning: divide by zero encountered in double_scalars\n", - " vif = 1. / (1. - r_squared_i)\n" + "WARNING | It was not possible to normalize the label values. See the data quality warning message for additional context.\n", + "WARNING | Skipping test due to failure during computation. See results folder of this test for further details.\n", + "WARNING | Skipping test due to failure during computation. See results folder of this test for further details.\n" ] }, { @@ -142,21 +129,12 @@ }, "metadata": {} }, - { - "output_type": "display_data", - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {} - }, { "output_type": "stream", "name": "stdout", "text": [ - "['DATA EXPECTATIONS'] Canceled Data Expectations engine execution due to dataset-expectation suite mismatch.\n" + "WARNING | The partial correlation matrix is not computable for this dataset. Skipping potential confounder and collider detection tests.\n", + "CRITICAL | Canceled Data Expectations engine execution due to dataset-expectation suite mismatch.\n" ] } ], @@ -184,19 +162,24 @@ "text": [ "Warnings count by priority:\n", "\tPriority 1: 2 warning(s)\n", - "\tPriority 2: 8 warning(s)\n", - "\tTOTAL: 10 warning(s)\n", + "\tPriority 2: 5 warning(s)\n", + "\tTOTAL: 7 warning(s)\n", "List of warnings sorted by priority:\n", "\t[TEST NORMALITY] The label distribution failed to pass a normality test as-is and following a battery of transforms. It is possible that the data originates from an exotic distribution, there is heavy outlier presence or it is multimodal. Addressing this issue might prove critical for regressor performance. (Priority 1: heavy impact expected)\n", "\t[DUPLICATE COLUMNS] Found 1 columns with exactly the same feature values as other columns. (Priority 1: heavy impact expected)\n", - "\t[COLLIDER CORRELATIONS] Found 99 independently uncorrelated variable pairs that showed correlation after controling for the remaining variables. This is an indicator of potential colliding bias with other covariates. (Priority 2: usage allowed, limited human intelligibility)\n", "\t[OUTLIER DETECTION] Found 2 potential outliers across the full dataset. A distance bigger than 3.0 standard deviations of intra-cluster distances to the respective centroids was used to define the potential outliers. (Priority 2: usage allowed, limited human intelligibility)\n", "\t[HIGH COLLINEARITY - CATEGORICAL] Found 3 categorical variables with significant collinearity (p-value < 0.05). 
The variables listed in results are highly collinear with other variables in the dataset and sorted descending according to propensity. These will make model explainability harder and potentially give way to issues like overfitting. Depending on your end goal you might want to remove variables following the provided order. (Priority 2: usage allowed, limited human intelligibility)\n", - "\t[PROXY IDENTIFICATION] Found 5 feature pairs of correlation to sensitive attributes with values higher than defined threshold (0.5). (Priority 2: usage allowed, limited human intelligibility)\n", - "\t[CONFOUNDED CORRELATIONS] Found 21 independently correlated variable pairs that disappeared after controling for the remaining variables. This is an indicator of potential confounder effects in the dataset. (Priority 2: usage allowed, limited human intelligibility)\n", - "\t[HIGH COLLINEARITY - NUMERICAL] Found 4 numerical variables with high Variance Inflation Factor (VIF>5.0). The variables listed in results are highly collinear with other variables in the dataset. These will make model explainability harder and potentially give way to issues like overfitting. Depending on your end goal you might want to remove the highest VIF variables. (Priority 2: usage allowed, limited human intelligibility)\n", - "\t[FLATLINES] Found 8 flatline events with a minimun length of 5 among the columns {'MainCity', 'Region'}. (Priority 2: usage allowed, limited human intelligibility)\n", - "\t[EXACT DUPLICATES] Found 20 instances with exact duplicate feature values. (Priority 2: usage allowed, limited human intelligibility)\n" + "\t[EXACT DUPLICATES] Found 20 instances with exact duplicate feature values. (Priority 2: usage allowed, limited human intelligibility)\n", + "\t[HIGH COLLINEARITY - NUMERICAL] Found 18 numerical variables with high Variance Inflation Factor (VIF>5.0). The variables listed in results are highly collinear with other variables in the dataset. 
These will make model explainability harder and potentially give way to issues like overfitting. Depending on your end goal you might want to remove the highest VIF variables. (Priority 2: usage allowed, limited human intelligibility)\n", + "\t[PROXY IDENTIFICATION] Found 5 feature pairs of correlation to sensitive attributes with values higher than defined threshold (0.5). (Priority 2: usage allowed, limited human intelligibility)\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/home/fsantos/miniconda3/envs/DQ/lib/python3.8/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n" ] } ], @@ -231,8 +214,8 @@ "output_type": "execute_result", "data": { "text/plain": [ - "('Outlier Detection',\n", - " 'Found 2 potential outliers across the full dataset. A distance bigger than 3.0 standard deviations of intra-cluster distances to the respective centroids was used to define the potential outliers.',\n", + "('High Collinearity - Categorical',\n", + " 'Found 3 categorical variables with significant collinearity (p-value < 0.05). The variables listed in results are highly collinear with other variables in the dataset and sorted descending according to propensity. These will make model explainability harder and potentially give way to issues like overfitting. 
Depending on your end goal you might want to remove variables following the provided order.',\n", " )" ] }, diff --git a/tutorials/main.ipynb b/tutorials/main.ipynb index f36f970a..cb545f70 100644 --- a/tutorials/main.ipynb +++ b/tutorials/main.ipynb @@ -53,13 +53,22 @@ "text": [ "Warnings count by priority:\n", "\tPriority 1: 1 warning(s)\n", - "\tPriority 2: 3 warning(s)\n", - "\tTOTAL: 4 warning(s)\n", + "\tPriority 2: 4 warning(s)\n", + "\tTOTAL: 5 warning(s)\n", "List of warnings sorted by priority:\n", "\t[DUPLICATE COLUMNS] Found 1 columns with exactly the same feature values as other columns. (Priority 1: heavy impact expected)\n", + "\t[PREDEFINED ERRONEOUS DATA] Found 1960 ED values in the dataset. (Priority 2: usage allowed, limited human intelligibility)\n", + "\t[HIGH COLLINEARITY - CATEGORICAL] Found 10 categorical variables with significant collinearity (p-value < 0.05). The variables listed in results are highly collinear with other variables in the dataset and sorted descending according to propensity. These will make model explainability harder and potentially give way to issues like overfitting. Depending on your end goal you might want to remove variables following the provided order. (Priority 2: usage allowed, limited human intelligibility)\n", "\t[EXACT DUPLICATES] Found 3 instances with exact duplicate feature values. (Priority 2: usage allowed, limited human intelligibility)\n", - "\t[FLATLINES] Found 4627 flatline events with a minimun length of 5 among the columns {'marital-status', 'workclass', 'income', 'native-country', 'capital-gain', 'capital-loss', 'education', 'occupation', 'workclass2', 'sex', 'education-num', 'hours-per-week', 'relationship', 'race'}. (Priority 2: usage allowed, limited human intelligibility)\n", - "\t[PREDEFINED ERRONEOUS DATA] Found 1960 ED values in the dataset. 
(Priority 2: usage allowed, limited human intelligibility)\n" + "\t[HIGH COLLINEARITY - NUMERICAL] Found 3 numerical variables with high Variance Inflation Factor (VIF>5.0). The variables listed in results are highly collinear with other variables in the dataset. These will make model explainability harder and potentially give way to issues like overfitting. Depending on your end goal you might want to remove the highest VIF variables. (Priority 2: usage allowed, limited human intelligibility)\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/home/fsantos/miniconda3/envs/DQ/lib/python3.8/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n" ] } ], @@ -95,7 +104,7 @@ "output_type": "execute_result", "data": { "text/plain": [ - "{'workclass': 'workclass2'}" + "{'workclass': ['workclass2']}" ] }, "metadata": {}, @@ -144,6 +153,14 @@ "\t[PROXY IDENTIFICATION] Found 1 feature pairs of correlation to sensitive attributes with values higher than defined threshold (0.5). (Priority 2: usage allowed, limited human intelligibility)\n", "\t[SENSITIVE ATTRIBUTE REPRESENTATIVITY] Found 2 values of 'race' sensitive attribute with low representativity in the dataset (below 1.00%). (Priority 2: usage allowed, limited human intelligibility)\n" ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/home/fsantos/GitRepos/ydata-quality/src/ydata_quality/bias_fairness/engine.py:72: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. 
Specify a dtype explicitly to silence this warning.\n", + " performances = pd.Series(index=self.sensitive_features)\n" + ] } ], "metadata": {} @@ -155,25 +172,23 @@ "bf_results" ], "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/home/fsantos/miniconda3/envs/DQ/lib/python3.8/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n" + ] + }, { "output_type": "execute_result", "data": { "text/plain": [ - "{'performance_discrimination': {'race': Black 0.639949\n", - " Other 1.000000\n", - " Asian-Pac-Islander 0.510216\n", - " White 0.562961\n", - " Amer-Indian-Eskimo 1.000000\n", - " dtype: float64,\n", - " 'sex': Female 0.526722\n", - " Male 0.591663\n", - " dtype: float64},\n", + "{'performance_discrimination': \"[ERROR] Test failed to compute. Original exception: performance_per_feature_values() got an unexpected keyword argument 'target'\",\n", " 'proxy_identification': features\n", " relationship_sex 0.650656\n", " Name: association, dtype: float64,\n", - " 'sensitive_predictability': race 0.121680\n", - " sex 0.249346\n", - " dtype: float64,\n", + " 'sensitive_predictability': \"[ERROR] Test failed to compute. 
Original exception: baseline_performance() got an unexpected keyword argument 'target'\",\n", " 'sensitive_representativity': {'race': White 0.8537\n", " Black 0.0978\n", " Asian-Pac-Islander 0.0303\n", @@ -315,11 +330,20 @@ "name": "stdout", "text": [ "Warnings count by priority:\n", - "\tPriority 2: 2 warning(s)\n", - "\tTOTAL: 2 warning(s)\n", + "\tPriority 2: 3 warning(s)\n", + "\tTOTAL: 3 warning(s)\n", "List of warnings sorted by priority:\n", - "\t[FLATLINES] Found 4165 flatline events with a minimun length of 5 among the columns {'marital-status', 'workclass', 'income', 'native-country', 'capital-gain', 'capital-loss', 'education', 'occupation', 'sex', 'education-num', 'hours-per-week', 'relationship', 'race'}. (Priority 2: usage allowed, limited human intelligibility)\n", - "\t[PREDEFINED ERRONEOUS DATA] Found 1360 ED values in the dataset. (Priority 2: usage allowed, limited human intelligibility)\n" + "\t[HIGH COLLINEARITY - CATEGORICAL] Found 9 categorical variables with significant collinearity (p-value < 0.05). The variables listed in results are highly collinear with other variables in the dataset and sorted descending according to propensity. These will make model explainability harder and potentially give way to issues like overfitting. Depending on your end goal you might want to remove variables following the provided order. (Priority 2: usage allowed, limited human intelligibility)\n", + "\t[PREDEFINED ERRONEOUS DATA] Found 1360 ED values in the dataset. (Priority 2: usage allowed, limited human intelligibility)\n", + "\t[HIGH COLLINEARITY - NUMERICAL] Found 3 numerical variables with high Variance Inflation Factor (VIF>5.0). The variables listed in results are highly collinear with other variables in the dataset. These will make model explainability harder and potentially give way to issues like overfitting. Depending on your end goal you might want to remove the highest VIF variables. 
(Priority 2: usage allowed, limited human intelligibility)\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/home/fsantos/miniconda3/envs/DQ/lib/python3.8/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n" ] } ], @@ -353,6 +377,14 @@ "List of warnings sorted by priority:\n", "\t[SENSITIVE ATTRIBUTE REPRESENTATIVITY] Found 2 values of 'race' sensitive attribute with low representativity in the dataset (below 1.00%). (Priority 2: usage allowed, limited human intelligibility)\n" ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/home/fsantos/GitRepos/ydata-quality/src/ydata_quality/bias_fairness/engine.py:72: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.\n", + " performances = pd.Series(index=self.sensitive_features)\n" + ] } ], "metadata": {} @@ -365,6 +397,14 @@ "better_bf.proxy_identification(th=0.45)" ], "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/home/fsantos/miniconda3/envs/DQ/lib/python3.8/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. 
Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n" + ] + }, { "output_type": "execute_result", "data": { @@ -394,7 +434,7 @@ "orig_nbformat": 4, "language_info": { "name": "python", - "version": "3.8.10", + "version": "3.8.11", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", @@ -406,10 +446,10 @@ }, "kernelspec": { "name": "python3", - "display_name": "Python 3.8.10 64-bit ('.venv': venv)" + "display_name": "Python 3.8.11 64-bit ('DQ': conda)" }, "interpreter": { - "hash": "cdc2bce73c2a9ac283f602628cabf735dbe06c4ee87a7849fc5f3d1177c8f304" + "hash": "e255f3ac955330aecee05fff6b7b15b68f4bd4cf0e9481cf0822c8a2e5228d43" } }, "nbformat": 4, From b5f2e58eadd23cb1780639e138ac70431f6862c5 Mon Sep 17 00:00:00 2001 From: Francisco Santos Date: Tue, 21 Sep 2021 23:28:15 +0100 Subject: [PATCH 06/10] Add warning levels arg and defaults --- src/ydata_quality/core/data_quality.py | 11 ++++++++--- src/ydata_quality/core/engine.py | 9 +++++++-- src/ydata_quality/data_expectations/engine.py | 9 +++++++-- src/ydata_quality/data_relations/engine.py | 9 +++++++-- src/ydata_quality/utils/logger.py | 1 - 5 files changed, 29 insertions(+), 10 deletions(-) diff --git a/src/ydata_quality/core/data_quality.py b/src/ydata_quality/core/data_quality.py index 609ef5d7..ef5f7ed0 100644 --- a/src/ydata_quality/core/data_quality.py +++ b/src/ydata_quality/core/data_quality.py @@ -2,6 +2,7 @@ Implementation of main class for Data Quality checks. 
""" from collections import Counter +from logging import _nameToLevel from typing import Callable, List, Optional, Union import pandas as pd @@ -38,8 +39,8 @@ def __init__(self, corr_th: float = 0.8, vif_th: float = 5, p_th: float = 0.05, - plot: bool = True - ): + plot: bool = True, + severity: Optional[str]= 'ERROR'): """ Engines: - Duplicates @@ -72,11 +73,15 @@ def __init__(self, vif_th (float): [DATA RELATIONS] Variance Inflation Factor threshold for numerical independence test, typically 5-10 is recommended. Defaults to 5. p_th (float): [DATA RELATIONS] Fraction of the right tail of the chi squared CDF defining threshold for categorical independence test. Defaults to 0.05. plot (bool): Pass True to produce all available graphical outputs, False to suppress all graphical output. + severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL] """ #TODO: Refactor legacy engines (property based) and logic in this class to new base (lean objects) self.df = df self._warnings = list() - self._logger = create_logger(NAME, STREAM, LOG_LEVEL) + if severity in _nameToLevel: + os.environ["DQ_LOG_LEVEL"] = severity + log_level = os.getenv('DQ_LOG_LEVEL', logging.INFO) + self._logger = create_logger(NAME, STREAM, log_level) self._random_state = random_state self._engines_legacy = { # Default list of engines diff --git a/src/ydata_quality/core/engine.py b/src/ydata_quality/core/engine.py index b40060d9..f669407e 100644 --- a/src/ydata_quality/core/engine.py +++ b/src/ydata_quality/core/engine.py @@ -4,6 +4,8 @@ from abc import ABC from collections import Counter from typing import Optional +import os +from logging import _nameToLevel import pandas as pd from numpy import random @@ -16,11 +18,14 @@ class QualityEngine(ABC): "Main class for running and storing data quality analysis." 
- def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label: str = None, dtypes: dict = None): + def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label: str = None, dtypes: dict = None, severity: Optional[str]= None): self._df = df self._df_type = None self._warnings = list() - self._logger = create_logger(NAME, STREAM, LOG_LEVEL) + if severity in _nameToLevel: + os.environ["DQ_LOG_LEVEL"] = severity + log_level = os.getenv('DQ_LOG_LEVEL', logging.INFO) + self._logger = create_logger(NAME, STREAM, log_level) self._tests = [] self._label = label self._dtypes = dtypes diff --git a/src/ydata_quality/data_expectations/engine.py b/src/ydata_quality/data_expectations/engine.py index 72a55431..82468f5f 100644 --- a/src/ydata_quality/data_expectations/engine.py +++ b/src/ydata_quality/data_expectations/engine.py @@ -2,6 +2,7 @@ Implementation of DataExpectationsReporter engine to run data expectations validation analysis. """ from typing import Optional +from logging import _nameToLevel import numpy as np import pandas as pd @@ -16,9 +17,13 @@ class DataExpectationsReporter(QualityEngine): Supports standard Great Expectations json reports from expectation suite validation runs. 
""" - def __init__(self): # Overrides base class init + def __init__(self, severity: Optional[str]= None): # Overrides base class init + "severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL]" self._warnings = [] # reset the warnings to avoid duplicates - self._logger = create_logger(NAME, STREAM, LOG_LEVEL) + if severity in _nameToLevel: + os.environ["DQ_LOG_LEVEL"] = severity + log_level = os.getenv('DQ_LOG_LEVEL', logging.INFO) + self._logger = create_logger(NAME, STREAM, log_level) @property def tests(self): diff --git a/src/ydata_quality/data_relations/engine.py b/src/ydata_quality/data_relations/engine.py index 823748ac..3e6016da 100644 --- a/src/ydata_quality/data_relations/engine.py +++ b/src/ydata_quality/data_relations/engine.py @@ -2,6 +2,7 @@ Implementation of DataRelationsDetector engine to run data relations analysis. """ from typing import Optional, Tuple, List +from logging import _nameToLevel import numpy as np import pandas as pd @@ -15,9 +16,13 @@ class DataRelationsDetector(QualityEngine): """Main class to run data relations analysis. 
""" - def __init__(self): # Overrides base class init + def __init__(self, severity: Optional[str] = None): # Overrides base class init + "severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL]" self._warnings = [] # reset the warnings to avoid duplicates - self._logger = create_logger(NAME, STREAM, LOG_LEVEL) + if severity in _nameToLevel: + os.environ["DQ_LOG_LEVEL"] = severity + log_level = os.getenv('DQ_LOG_LEVEL', logging.INFO) + self._logger = create_logger(NAME, STREAM, log_level) @property def tests(self): diff --git a/src/ydata_quality/utils/logger.py b/src/ydata_quality/utils/logger.py index 24a2474e..05dba78a 100644 --- a/src/ydata_quality/utils/logger.py +++ b/src/ydata_quality/utils/logger.py @@ -6,7 +6,6 @@ # Default vars for the logger NAME = os.getenv('DQ_LOGGER_NAME', 'DQ_Logger') STREAM = sys.stdout -LOG_LEVEL = os.getenv('DQ_LOG_LEVEL', logging.WARNING) def create_logger(name, stream: TextIO = sys.stdout, level=logging.INFO): handler = logging.StreamHandler(stream) From 3f7a3751093a0d51febe99d505433c6f863856f4 Mon Sep 17 00:00:00 2001 From: Francisco Santos Date: Wed, 22 Sep 2021 00:00:56 +0100 Subject: [PATCH 07/10] missing imports --- src/ydata_quality/core/data_quality.py | 5 +++-- src/ydata_quality/core/engine.py | 6 +++--- src/ydata_quality/data_expectations/engine.py | 5 +++-- src/ydata_quality/data_relations/engine.py | 5 +++-- src/ydata_quality/duplicates/engine.py | 1 + src/ydata_quality/erroneous_data/engine.py | 2 +- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/ydata_quality/core/data_quality.py b/src/ydata_quality/core/data_quality.py index ef5f7ed0..d475fd02 100644 --- a/src/ydata_quality/core/data_quality.py +++ b/src/ydata_quality/core/data_quality.py @@ -3,6 +3,7 @@ """ from collections import Counter from logging import _nameToLevel +import os from typing import Callable, List, Optional, Union import pandas as pd @@ -16,7 +17,7 @@ from 
ydata_quality.data_expectations import DataExpectationsReporter from ydata_quality.bias_fairness import BiasFairness from ydata_quality.data_relations import DataRelationsDetector -from ydata_quality.utils.logger import * +from ydata_quality.utils.logger import create_logger, NAME, STREAM class DataQuality: "DataQuality contains the multiple data quality engines." @@ -80,7 +81,7 @@ def __init__(self, self._warnings = list() if severity in _nameToLevel: os.environ["DQ_LOG_LEVEL"] = severity - log_level = os.getenv('DQ_LOG_LEVEL', logging.INFO) + log_level = os.getenv('DQ_LOG_LEVEL', _nameToLevel['INFO']) self._logger = create_logger(NAME, STREAM, log_level) self._random_state = random_state diff --git a/src/ydata_quality/core/engine.py b/src/ydata_quality/core/engine.py index f669407e..be9c4524 100644 --- a/src/ydata_quality/core/engine.py +++ b/src/ydata_quality/core/engine.py @@ -12,7 +12,7 @@ from ydata_quality.core.warnings import Priority, QualityWarning from ydata_quality.utils.auxiliary import infer_df_type, infer_dtypes -from ydata_quality.utils.logger import * +from ydata_quality.utils.logger import create_logger, NAME, STREAM class QualityEngine(ABC): @@ -24,7 +24,7 @@ def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label: self._warnings = list() if severity in _nameToLevel: os.environ["DQ_LOG_LEVEL"] = severity - log_level = os.getenv('DQ_LOG_LEVEL', logging.INFO) + log_level = os.getenv('DQ_LOG_LEVEL', _nameToLevel['INFO']) self._logger = create_logger(NAME, STREAM, log_level) self._tests = [] self._label = label @@ -143,6 +143,6 @@ def evaluate(self): try: # if anything fails results[test] = getattr(self, test)() except Exception as exc: # print a Warning and log the message - self._logger.warning('Skipping test due to failure during computation. See results folder of this test for further details.') + self._logger.warning('Skipping %s due to failure during computation. 
See results folder of this test for further details.', test) results[test] = "[ERROR] Test failed to compute. Original exception: "+f"{exc}" return results diff --git a/src/ydata_quality/data_expectations/engine.py b/src/ydata_quality/data_expectations/engine.py index 82468f5f..295490a7 100644 --- a/src/ydata_quality/data_expectations/engine.py +++ b/src/ydata_quality/data_expectations/engine.py @@ -1,6 +1,7 @@ """ Implementation of DataExpectationsReporter engine to run data expectations validation analysis. """ +import os from typing import Optional from logging import _nameToLevel @@ -9,7 +10,7 @@ from ydata_quality.core import QualityEngine, QualityWarning from ydata_quality.utils.auxiliary import test_load_json_path -from ydata_quality.utils.logger import * +from ydata_quality.utils.logger import create_logger, NAME, STREAM class DataExpectationsReporter(QualityEngine): @@ -22,7 +23,7 @@ def __init__(self, severity: Optional[str]= None): # Overrides base class init self._warnings = [] # reset the warnings to avoid duplicates if severity in _nameToLevel: os.environ["DQ_LOG_LEVEL"] = severity - log_level = os.getenv('DQ_LOG_LEVEL', logging.INFO) + log_level = os.getenv('DQ_LOG_LEVEL', _nameToLevel['INFO']) self._logger = create_logger(NAME, STREAM, log_level) @property diff --git a/src/ydata_quality/data_relations/engine.py b/src/ydata_quality/data_relations/engine.py index 3e6016da..c3fa5673 100644 --- a/src/ydata_quality/data_relations/engine.py +++ b/src/ydata_quality/data_relations/engine.py @@ -1,6 +1,7 @@ """ Implementation of DataRelationsDetector engine to run data relations analysis. 
""" +import os from typing import Optional, Tuple, List from logging import _nameToLevel @@ -10,7 +11,7 @@ from ydata_quality.core import QualityEngine, QualityWarning from ydata_quality.utils.correlations import correlation_matrix, partial_correlation_matrix, correlation_plotter, vif_collinearity, chi2_collinearity from ydata_quality.utils.auxiliary import infer_dtypes, standard_normalize -from ydata_quality.utils.logger import * +from ydata_quality.utils.logger import create_logger, NAME, STREAM class DataRelationsDetector(QualityEngine): """Main class to run data relations analysis. @@ -21,7 +22,7 @@ def __init__(self, severity: Optional[str] = None): # Overrides base class init self._warnings = [] # reset the warnings to avoid duplicates if severity in _nameToLevel: os.environ["DQ_LOG_LEVEL"] = severity - log_level = os.getenv('DQ_LOG_LEVEL', logging.INFO) + log_level = os.getenv('DQ_LOG_LEVEL', _nameToLevel['INFO']) self._logger = create_logger(NAME, STREAM, log_level) @property diff --git a/src/ydata_quality/duplicates/engine.py b/src/ydata_quality/duplicates/engine.py index ffb8616c..184be425 100644 --- a/src/ydata_quality/duplicates/engine.py +++ b/src/ydata_quality/duplicates/engine.py @@ -15,6 +15,7 @@ class DuplicateChecker(QualityEngine): def __init__(self, df: pd.DataFrame, entities: List[Union[str, List[str]]] = [], is_close: bool=False): """ + Arguments: df (pd.DataFrame): reference DataFrame used to run the DataQuality analysis. entities (List[Union[str, List[str]]]): entities relevant for duplicate analysis. Passing lists allows composed entities of multiple columns. 
is_close (bool): Pass True to use numpy.isclose instead of pandas.equals in column comparison.""" diff --git a/src/ydata_quality/erroneous_data/engine.py b/src/ydata_quality/erroneous_data/engine.py index 25b6b7ff..f3a3884c 100644 --- a/src/ydata_quality/erroneous_data/engine.py +++ b/src/ydata_quality/erroneous_data/engine.py @@ -83,7 +83,7 @@ def flatlines(self, th: int=5, skip: list=[]): skip: List of columns that will not be target of search for flatlines. Pass '__index' inside skip list to skip looking for flatlines at the index.""" if self.df_type == DataFrameType.TABULAR: - self._logger.info('The provided DataFrame is not a valid Timeseries type, skipping this test.') + self._logger.debug('The provided DataFrame is not a valid Timeseries type, skipping flatlines test.') return None flatlines = {} for column in self.df.columns: # Compile flatline index From 47283659739f0c88ad3de05c954d4d076e0eccd9 Mon Sep 17 00:00:00 2001 From: Francisco Santos Date: Wed, 22 Sep 2021 10:48:20 +0100 Subject: [PATCH 08/10] simplify logger instantiation --- src/ydata_quality/bias_fairness/engine.py | 10 +++--- src/ydata_quality/core/data_quality.py | 31 +++++++++---------- src/ydata_quality/core/engine.py | 9 ++---- src/ydata_quality/data_expectations/engine.py | 9 ++---- src/ydata_quality/data_relations/engine.py | 9 ++---- src/ydata_quality/drift/engine.py | 5 +-- src/ydata_quality/duplicates/engine.py | 7 +++-- src/ydata_quality/erroneous_data/engine.py | 5 +-- src/ydata_quality/labelling/engine.py | 31 ++++++++++++------- src/ydata_quality/missings/engine.py | 5 +-- src/ydata_quality/utils/logger.py | 9 ++++-- 11 files changed, 66 insertions(+), 64 deletions(-) diff --git a/src/ydata_quality/bias_fairness/engine.py b/src/ydata_quality/bias_fairness/engine.py index 5c63ff09..2481acfe 100644 --- a/src/ydata_quality/bias_fairness/engine.py +++ b/src/ydata_quality/bias_fairness/engine.py @@ -22,14 +22,16 @@ class BiasFairness(QualityEngine): """ def __init__(self, df: 
pd.DataFrame, sensitive_features: List[str], label: Optional[str] = None, - random_state: Optional[int] = None): + random_state: Optional[int] = None, severity: Optional[str]= None): """ Args df (pd.DataFrame): reference DataFrame used to run the analysis sensitive_features (List[str]): features deemed as sensitive attributes label (str, optional): target feature to be predicted + severity (str, optional): Sets the logger warning threshold to one of the valid levels + [DEBUG, INFO, WARNING, ERROR, CRITICAL] """ - super().__init__(df=df, label=label, random_state=random_state) + super().__init__(df=df, label=label, random_state=random_state, severity=severity) self._sensitive_features = sensitive_features self._tests = ["performance_discrimination", "proxy_identification", "sensitive_predictability", "sensitive_representativity"] @@ -72,7 +74,7 @@ def sensitive_predictability(self, th=0.5, adjusted_metric=True): performances = pd.Series(index=self.sensitive_features) for feat in performances.index: data = self.df.drop(columns=[x for x in drop_features if x != feat]) # drop all except target - performances[feat] = baseline_performance(df=data, target=feat, adjusted_metric=adjusted_metric) + performances[feat] = baseline_performance(df=data, label=feat, adjusted_metric=adjusted_metric) high_perfs = performances[performances>th] if len(high_perfs) > 0: @@ -97,7 +99,7 @@ def performance_discrimination(self): res = {} for feat in self.sensitive_features: - res[feat] = pd.Series(performance_per_feature_values(df=self.df, feature=feat, target=self.label)) + res[feat] = pd.Series(performance_per_feature_values(df=self.df, feature=feat, label=self.label)) return res diff --git a/src/ydata_quality/core/data_quality.py b/src/ydata_quality/core/data_quality.py index d475fd02..1558a3df 100644 --- a/src/ydata_quality/core/data_quality.py +++ b/src/ydata_quality/core/data_quality.py @@ -2,8 +2,6 @@ Implementation of main class for Data Quality checks. 
""" from collections import Counter -from logging import _nameToLevel -import os from typing import Callable, List, Optional, Union import pandas as pd @@ -17,7 +15,7 @@ from ydata_quality.data_expectations import DataExpectationsReporter from ydata_quality.bias_fairness import BiasFairness from ydata_quality.data_relations import DataRelationsDetector -from ydata_quality.utils.logger import create_logger, NAME, STREAM +from ydata_quality.utils.logger import get_logger, NAME class DataQuality: "DataQuality contains the multiple data quality engines." @@ -41,7 +39,7 @@ def __init__(self, vif_th: float = 5, p_th: float = 0.05, plot: bool = True, - severity: Optional[str]= 'ERROR'): + severity: str= 'ERROR'): """ Engines: - Duplicates @@ -74,25 +72,22 @@ def __init__(self, vif_th (float): [DATA RELATIONS] Variance Inflation Factor threshold for numerical independence test, typically 5-10 is recommended. Defaults to 5. p_th (float): [DATA RELATIONS] Fraction of the right tail of the chi squared CDF defining threshold for categorical independence test. Defaults to 0.05. plot (bool): Pass True to produce all available graphical outputs, False to suppress all graphical output. 
- severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL] + severity (str): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL] """ #TODO: Refactor legacy engines (property based) and logic in this class to new base (lean objects) self.df = df self._warnings = list() - if severity in _nameToLevel: - os.environ["DQ_LOG_LEVEL"] = severity - log_level = os.getenv('DQ_LOG_LEVEL', _nameToLevel['INFO']) - self._logger = create_logger(NAME, STREAM, log_level) + self._logger = get_logger(NAME, level=severity) self._random_state = random_state self._engines_legacy = { # Default list of engines - 'duplicates': DuplicateChecker(df=df, entities=entities, is_close=is_close), - 'missings': MissingsProfiler(df=df, label=label, random_state=self.random_state), - 'erroneous-data': ErroneousDataIdentifier(df=df, ed_extensions=ed_extensions), - 'drift': DriftAnalyser(ref=df, sample=sample, label=label, model=model, random_state=self.random_state) + 'duplicates': DuplicateChecker(df=df, entities=entities, is_close=is_close, severity=severity), + 'missings': MissingsProfiler(df=df, label=label, random_state=self.random_state, severity=severity), + 'erroneous-data': ErroneousDataIdentifier(df=df, ed_extensions=ed_extensions, severity=severity), + 'drift': DriftAnalyser(ref=df, sample=sample, label=label, model=model, random_state=self.random_state, severity=severity) } - self._engines_new = {'data-relations': DataRelationsDetector()} + self._engines_new = {'data-relations': DataRelationsDetector(severity=severity)} self._eval_args = { # Argument lists for different engines # TODO: centralize shared args in a dictionary to pass just like a regular kwargs to engines, pass specific args in arg list (define here) # In new standard all engines can be run at the evaluate method only, the evaluate run expression can then be: @@ -103,16 +98,18 @@ def __init__(self, # Engines 
based on mandatory arguments if label is not None: - self._engines_legacy['labelling'] = LabelInspector(df=df, label=label, random_state=self.random_state) + self._engines_legacy['labelling'] = LabelInspector(df=df, label=label, + random_state=self.random_state, severity=severity) else: self._logger.warning('Label is not defined. Skipping LABELLING engine.') if len(sensitive_features)>0: self._engines_legacy['bias&fairness'] = BiasFairness(df=df, sensitive_features=sensitive_features, - label=label, random_state=self.random_state) + label=label, random_state=self.random_state, + severity=severity) else: self._logger.warning('Sensitive features not defined. Skipping BIAS & FAIRNESS engine.') if results_json_path is not None: - self._engines_new['expectations'] = DataExpectationsReporter() + self._engines_new['expectations'] = DataExpectationsReporter(severity=severity) else: self._logger.warning('The path to a Great Expectations results json is not defined. Skipping EXPECTATIONS engine.') diff --git a/src/ydata_quality/core/engine.py b/src/ydata_quality/core/engine.py index be9c4524..cab7d037 100644 --- a/src/ydata_quality/core/engine.py +++ b/src/ydata_quality/core/engine.py @@ -4,15 +4,13 @@ from abc import ABC from collections import Counter from typing import Optional -import os -from logging import _nameToLevel import pandas as pd from numpy import random from ydata_quality.core.warnings import Priority, QualityWarning from ydata_quality.utils.auxiliary import infer_df_type, infer_dtypes -from ydata_quality.utils.logger import create_logger, NAME, STREAM +from ydata_quality.utils.logger import get_logger, NAME class QualityEngine(ABC): @@ -22,10 +20,7 @@ def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label: self._df = df self._df_type = None self._warnings = list() - if severity in _nameToLevel: - os.environ["DQ_LOG_LEVEL"] = severity - log_level = os.getenv('DQ_LOG_LEVEL', _nameToLevel['INFO']) - self._logger = create_logger(NAME, 
STREAM, log_level) + self._logger = get_logger(NAME, level=severity) self._tests = [] self._label = label self._dtypes = dtypes diff --git a/src/ydata_quality/data_expectations/engine.py b/src/ydata_quality/data_expectations/engine.py index 295490a7..1cdd8e82 100644 --- a/src/ydata_quality/data_expectations/engine.py +++ b/src/ydata_quality/data_expectations/engine.py @@ -1,16 +1,14 @@ """ Implementation of DataExpectationsReporter engine to run data expectations validation analysis. """ -import os from typing import Optional -from logging import _nameToLevel import numpy as np import pandas as pd from ydata_quality.core import QualityEngine, QualityWarning from ydata_quality.utils.auxiliary import test_load_json_path -from ydata_quality.utils.logger import create_logger, NAME, STREAM +from ydata_quality.utils.logger import get_logger, NAME class DataExpectationsReporter(QualityEngine): @@ -21,10 +19,7 @@ class DataExpectationsReporter(QualityEngine): def __init__(self, severity: Optional[str]= None): # Overrides base class init "severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL]" self._warnings = [] # reset the warnings to avoid duplicates - if severity in _nameToLevel: - os.environ["DQ_LOG_LEVEL"] = severity - log_level = os.getenv('DQ_LOG_LEVEL', _nameToLevel['INFO']) - self._logger = create_logger(NAME, STREAM, log_level) + self._logger = get_logger(NAME, level=severity) @property def tests(self): diff --git a/src/ydata_quality/data_relations/engine.py b/src/ydata_quality/data_relations/engine.py index c3fa5673..28f3a47b 100644 --- a/src/ydata_quality/data_relations/engine.py +++ b/src/ydata_quality/data_relations/engine.py @@ -1,9 +1,7 @@ """ Implementation of DataRelationsDetector engine to run data relations analysis. 
""" -import os from typing import Optional, Tuple, List -from logging import _nameToLevel import numpy as np import pandas as pd @@ -11,7 +9,7 @@ from ydata_quality.core import QualityEngine, QualityWarning from ydata_quality.utils.correlations import correlation_matrix, partial_correlation_matrix, correlation_plotter, vif_collinearity, chi2_collinearity from ydata_quality.utils.auxiliary import infer_dtypes, standard_normalize -from ydata_quality.utils.logger import create_logger, NAME, STREAM +from ydata_quality.utils.logger import get_logger, NAME class DataRelationsDetector(QualityEngine): """Main class to run data relations analysis. @@ -20,10 +18,7 @@ class DataRelationsDetector(QualityEngine): def __init__(self, severity: Optional[str] = None): # Overrides base class init "severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL]" self._warnings = [] # reset the warnings to avoid duplicates - if severity in _nameToLevel: - os.environ["DQ_LOG_LEVEL"] = severity - log_level = os.getenv('DQ_LOG_LEVEL', _nameToLevel['INFO']) - self._logger = create_logger(NAME, STREAM, log_level) + self._logger = get_logger(NAME, level=severity) @property def tests(self): diff --git a/src/ydata_quality/drift/engine.py b/src/ydata_quality/drift/engine.py index 1edeb219..640c4b18 100644 --- a/src/ydata_quality/drift/engine.py +++ b/src/ydata_quality/drift/engine.py @@ -70,7 +70,7 @@ class DriftAnalyser(QualityEngine): def __init__(self, ref: pd.DataFrame, sample: Optional[pd.DataFrame] = None, label: Optional[str] = None, model: Optional[Union[Callable, ModelWrapper]] = None, holdout: float = 0.2, - random_state: Optional[int] = None): + random_state: Optional[int] = None, severity:Optional[str]=None): """ Initializes the engine properties and lists tests for automated evaluation. 
Args: @@ -84,8 +84,9 @@ def __init__(self, ref: pd.DataFrame, sample: Optional[pd.DataFrame] = None, holdout (float): Fraction to be kept as holdout for drift test. random_state (Optional, int): Seed used to guarantee reproducibility of the random sample splits. Pass None for no reproducibility. + severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL] """ - super().__init__(df=ref, label=label, random_state=random_state) + super().__init__(df=ref, label=label, random_state=random_state, severity=severity) self.sample = sample self.model = model self._holdout, self._remaining_data = random_split(ref, holdout, random_state=self.random_state) diff --git a/src/ydata_quality/duplicates/engine.py b/src/ydata_quality/duplicates/engine.py index 184be425..d0278684 100644 --- a/src/ydata_quality/duplicates/engine.py +++ b/src/ydata_quality/duplicates/engine.py @@ -13,13 +13,14 @@ class DuplicateChecker(QualityEngine): "Engine for running analyis on duplicate records." - def __init__(self, df: pd.DataFrame, entities: List[Union[str, List[str]]] = [], is_close: bool=False): + def __init__(self, df: pd.DataFrame, entities: List[Union[str, List[str]]] = [], is_close: bool=False, severity: Optional[str]= None): """ Arguments: df (pd.DataFrame): reference DataFrame used to run the DataQuality analysis. entities (List[Union[str, List[str]]]): entities relevant for duplicate analysis. Passing lists allows composed entities of multiple columns. - is_close (bool): Pass True to use numpy.isclose instead of pandas.equals in column comparison.""" - super().__init__(df=df) + is_close (bool): Pass True to use numpy.isclose instead of pandas.equals in column comparison. 
+ severity (str): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL]""" + super().__init__(df=df, severity=severity) self._entities = entities self._tests = ["exact_duplicates", "entity_duplicates", "duplicate_columns"] self._is_close = is_close diff --git a/src/ydata_quality/erroneous_data/engine.py b/src/ydata_quality/erroneous_data/engine.py index f3a3884c..ab9483be 100644 --- a/src/ydata_quality/erroneous_data/engine.py +++ b/src/ydata_quality/erroneous_data/engine.py @@ -13,13 +13,14 @@ class ErroneousDataIdentifier(QualityEngine): "Engine for running analysis on erroneous data." - def __init__(self, df: pd.DataFrame, ed_extensions: Optional[list]=[]): + def __init__(self, df: pd.DataFrame, ed_extensions: Optional[list]=[], severity:Optional[str]=None): """ Args: df (pd.DataFrame): DataFrame used to run the erroneous data analysis. ed_extensions: A list of user provided erroneous data values to append to defaults. + severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL] """ - super().__init__(df=df) + super().__init__(df=df, severity=severity) if self.df_type == DataFrameType.TIMESERIES: self._tests = ["flatlines", "predefined_erroneous_data"] else: diff --git a/src/ydata_quality/labelling/engine.py b/src/ydata_quality/labelling/engine.py index 98585ab4..32c26e67 100644 --- a/src/ydata_quality/labelling/engine.py +++ b/src/ydata_quality/labelling/engine.py @@ -14,20 +14,29 @@ standard_transform) -def LabelInspector(df, label, random_state: Optional[int]=None): - """Instantiate this label inspector class. - Runs a label type inference to instantiate the correct label inspector.""" +def LabelInspector(df, label, random_state: Optional[int]=None, severity:Optional[str]=None): + """Runs a label type inference to instantiate the correct label inspector. + Instantiate this label inspector method to create a Label Inspector. 
+ + Arguments: + df (pd.DataFrame): reference DataFrame used to run the label analysis. + label (str, optional): target feature to be predicted. + random_state (int, optional): Integer seed for random reproducibility. Default is None. + Set to None for fully random behavior, no reproducibility. + severity (str, optional): Sets the logger warning threshold to one of the valid levels + [DEBUG, INFO, WARNING, ERROR, CRITICAL] + """ label_dtype = infer_dtypes(df[label])[label] # Label column dtype inferral if label_dtype == 'categorical': - return CategoricalLabelInspector(df, label, random_state=random_state) + return CategoricalLabelInspector(df, label, random_state=random_state, severity=severity) else: - return NumericalLabelInspector(df, label, random_state=random_state) + return NumericalLabelInspector(df, label, random_state=random_state, severity=severity) class SharedLabelInspector(QualityEngine): """Shared structure for Numerical/Categorical Label Inspector""" - def __init__(self, df: pd.DataFrame, label: str, random_state: Optional[int]=None): - super().__init__(df=df, label=label, random_state=random_state) + def __init__(self, df: pd.DataFrame, label: str, random_state: Optional[int]=None, severity:Optional[str]=None): + super().__init__(df=df, label=label, random_state=random_state, severity=severity) self._tdf = None @property @@ -70,8 +79,8 @@ class CategoricalLabelInspector(SharedLabelInspector): """Engine for running analysis on categorical labels. 
Ordinal labels can be handled if passed as categorical.""" - def __init__(self, df: pd.DataFrame, label: str, random_state: Optional[int]): - super().__init__(df=df, label=label, random_state=random_state) + def __init__(self, df: pd.DataFrame, label: str, random_state: Optional[int], severity:Optional[str]=None): + super().__init__(df=df, label=label, random_state=random_state, severity = severity) self._centroids = None self._tests = ["missing_labels", "few_labels", "unbalanced_classes", "one_vs_rest_performance", "outlier_detection"] @@ -235,8 +244,8 @@ def outlier_detection(self, th=3): class NumericalLabelInspector(SharedLabelInspector): "Engine for running analyis on numerical labels." - def __init__(self, df: pd.DataFrame, label: str, random_state): - super().__init__(df=df, label=label, random_state=random_state) + def __init__(self, df: pd.DataFrame, label: str, random_state, severity:Optional[str]=None): + super().__init__(df=df, label=label, random_state=random_state, severity = severity) self._tests = ["missing_labels", "test_normality", "outlier_detection"] def _GMM_clusters(self, max_clusters): diff --git a/src/ydata_quality/missings/engine.py b/src/ydata_quality/missings/engine.py index 992e0af0..b1949297 100644 --- a/src/ydata_quality/missings/engine.py +++ b/src/ydata_quality/missings/engine.py @@ -15,15 +15,16 @@ class MissingsProfiler(QualityEngine): "Main class to run missing value analysis." - def __init__(self, df: pd.DataFrame, label: Optional[str] = None, random_state: Optional[int]=None): + def __init__(self, df: pd.DataFrame, label: Optional[str] = None, random_state: Optional[int]=None, severity: Optional[str]= None): """ Args: df (pd.DataFrame): reference DataFrame used to run the missing value analysis. label (str, optional): target feature to be predicted. random_state (int, optional): Integer seed for random reproducibility. Default is None. Set to None for fully random behavior, no reproducibility. 
+ severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL] """ - super().__init__(df=df, random_state=random_state) + super().__init__(df=df, random_state=random_state, severity=severity) self._label = label self._tests = ["nulls_higher_than", "high_missing_correlations", "predict_missings"] diff --git a/src/ydata_quality/utils/logger.py b/src/ydata_quality/utils/logger.py index 05dba78a..0057eddf 100644 --- a/src/ydata_quality/utils/logger.py +++ b/src/ydata_quality/utils/logger.py @@ -2,12 +2,16 @@ from typing import TextIO import sys import os +from logging import _nameToLevel # Default vars for the logger NAME = os.getenv('DQ_LOGGER_NAME', 'DQ_Logger') -STREAM = sys.stdout -def create_logger(name, stream: TextIO = sys.stdout, level=logging.INFO): +def get_logger(name, stream: TextIO = sys.stdout, level: str=logging.INFO): + acceptable_levels = [None]+list(_nameToLevel.keys()) + assert level in acceptable_levels, "Valid levels for warning severity are {}. 
Defaults to info level.".format(acceptable_levels) + if not level: + level = logging.INFO # Default threshold handler = logging.StreamHandler(stream) handler.setFormatter( logging.Formatter( @@ -22,3 +26,4 @@ def create_logger(name, stream: TextIO = sys.stdout, level=logging.INFO): logger.propagate = False return logger + From 4c592726fdf03ccc349cfed3135a78a9bf6358e6 Mon Sep 17 00:00:00 2001 From: Francisco Santos Date: Wed, 22 Sep 2021 10:53:00 +0100 Subject: [PATCH 09/10] Fix unnecessary setting of label outside of super --- src/ydata_quality/missings/engine.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ydata_quality/missings/engine.py b/src/ydata_quality/missings/engine.py index b1949297..b20bf5d9 100644 --- a/src/ydata_quality/missings/engine.py +++ b/src/ydata_quality/missings/engine.py @@ -24,8 +24,7 @@ def __init__(self, df: pd.DataFrame, label: Optional[str] = None, random_state: Set to None for fully random behavior, no reproducibility. severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL] """ - super().__init__(df=df, random_state=random_state, severity=severity) - self._label = label + super().__init__(df=df, random_state=random_state, label=label, severity=severity) self._tests = ["nulls_higher_than", "high_missing_correlations", "predict_missings"] def _get_null_cols(self, col: Optional[str] = None) -> List[str]: From 59cb4e41935f3a0898de2866437db92a655dcd60 Mon Sep 17 00:00:00 2001 From: Francisco Santos Date: Wed, 22 Sep 2021 11:58:35 +0100 Subject: [PATCH 10/10] up a log to warning --- src/ydata_quality/duplicates/engine.py | 2 +- src/ydata_quality/utils/logger.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ydata_quality/duplicates/engine.py b/src/ydata_quality/duplicates/engine.py index d0278684..38e81ddf 100644 --- a/src/ydata_quality/duplicates/engine.py +++ b/src/ydata_quality/duplicates/engine.py @@ -98,7 +98,7 @@ 
def entity_duplicates(self, entity: Optional[Union[str, List[str]]] = None): ent_dups.setdefault(entity_key, {})[val] = dups[(dups[entity].values==val).all(axis=1)] else: # if entity is not specified if len(self.entities) == 0: - self._logger.info("There are no entities defined to run the analysis. Skipping the test.") + self._logger.warning("There are no entities defined to run the analysis. Skipping the test.") return None else: for col in self.entities: diff --git a/src/ydata_quality/utils/logger.py b/src/ydata_quality/utils/logger.py index 0057eddf..2801bb23 100644 --- a/src/ydata_quality/utils/logger.py +++ b/src/ydata_quality/utils/logger.py @@ -9,7 +9,7 @@ def get_logger(name, stream: TextIO = sys.stdout, level: str=logging.INFO): acceptable_levels = [None]+list(_nameToLevel.keys()) - assert level in acceptable_levels, "Valid levels for warning severity are {}. Defaults to info level.".format(acceptable_levels) + assert level in acceptable_levels, f"Valid levels for warning severity are {acceptable_levels}. Defaults to info level." if not level: level = logging.INFO # Default threshold handler = logging.StreamHandler(stream)