feat: add loggers and change prints to logs #27

Merged · 11 commits · Sep 22, 2021
Changes from 8 commits
13 changes: 7 additions & 6 deletions src/ydata_quality/bias_fairness/engine.py
@@ -22,14 +22,16 @@ class BiasFairness(QualityEngine):
"""

def __init__(self, df: pd.DataFrame, sensitive_features: List[str], label: Optional[str] = None,
random_state: Optional[int] = None):
random_state: Optional[int] = None, severity: Optional[str]= None):
"""
Args
df (pd.DataFrame): reference DataFrame used to run the analysis
sensitive_features (List[str]): features deemed as sensitive attributes
label (str, optional): target feature to be predicted
severity (str, optional): Sets the logger warning threshold to one of the valid levels
[DEBUG, INFO, WARNING, ERROR, CRITICAL]
"""
super().__init__(df=df, label=label, random_state=random_state)
super().__init__(df=df, label=label, random_state=random_state, severity=severity)
self._sensitive_features = sensitive_features
self._tests = ["performance_discrimination", "proxy_identification",
"sensitive_predictability", "sensitive_representativity"]
@@ -72,7 +74,7 @@ def sensitive_predictability(self, th=0.5, adjusted_metric=True):
performances = pd.Series(index=self.sensitive_features)
for feat in performances.index:
data = self.df.drop(columns=[x for x in drop_features if x != feat]) # drop all except target
performances[feat] = baseline_performance(df=data, target=feat, adjusted_metric=adjusted_metric)
performances[feat] = baseline_performance(df=data, label=feat, adjusted_metric=adjusted_metric)

high_perfs = performances[performances>th]
if len(high_perfs) > 0:
@@ -93,12 +95,11 @@ def performance_discrimination(self):
"""
# TODO: support error rate parity metrics (e.g. false positive rate, positive rate)
if self.label is None:
print('[BIAS&FAIRNESS] Argument "label" must be defined to calculate performance discrimination metric. Skipping test.')
pass
self._logger.warning('Argument "label" must be defined to calculate performance discrimination metric. Skipping test.')

res = {}
for feat in self.sensitive_features:
res[feat] = pd.Series(performance_per_feature_values(df=self.df, feature=feat, target=self.label))
res[feat] = pd.Series(performance_per_feature_values(df=self.df, feature=feat, label=self.label))
return res


38 changes: 22 additions & 16 deletions src/ydata_quality/core/data_quality.py
@@ -15,6 +15,7 @@
from ydata_quality.data_expectations import DataExpectationsReporter
from ydata_quality.bias_fairness import BiasFairness
from ydata_quality.data_relations import DataRelationsDetector
from ydata_quality.utils.logger import get_logger, NAME

class DataQuality:
"DataQuality contains the multiple data quality engines."
@@ -37,8 +38,8 @@ def __init__(self,
corr_th: float = 0.8,
vif_th: float = 5,
p_th: float = 0.05,
plot: bool = True
):
plot: bool = True,
severity: str= 'ERROR'):
"""
Engines:
- Duplicates
@@ -55,7 +56,7 @@ def __init__(self,
label (str, optional): [MISSINGS, LABELLING, DRIFT ANALYSIS] target feature to be predicted.
If not specified, LABELLING is skipped.
random_state (int, optional): Integer seed for random reproducibility. Default is None.
Set to None for fully random behaviour, no reproducibility.
Set to None for fully random behavior, no reproducibility.
entities: [DUPLICATES] entities relevant for duplicate analysis.
is_close: [DUPLICATES] Pass True to use numpy.isclose instead of pandas.equals in column comparison.
ed_extensions: [ERRONEOUS DATA] A list of user provided erroneous data values to append to defaults.
@@ -71,19 +72,22 @@ def __init__(self,
vif_th (float): [DATA RELATIONS] Variance Inflation Factor threshold for numerical independence test, typically 5-10 is recommended. Defaults to 5.
p_th (float): [DATA RELATIONS] Fraction of the right tail of the chi squared CDF defining threshold for categorical independence test. Defaults to 0.05.
plot (bool): Pass True to produce all available graphical outputs, False to suppress all graphical output.
severity (str): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL]
"""
#TODO: Refactor legacy engines (property based) and logic in this class to new base (lean objects)
self.df = df
self._warnings = list()
self._logger = get_logger(NAME, level=severity)
self._random_state = random_state

self._engines_legacy = { # Default list of engines
'duplicates': DuplicateChecker(df=df, entities=entities, is_close=is_close),
'missings': MissingsProfiler(df=df, target=label, random_state=self.random_state),
'erroneous-data': ErroneousDataIdentifier(df=df, ed_extensions=ed_extensions),
'drift': DriftAnalyser(ref=df, sample=sample, label=label, model=model, random_state=self.random_state)
'duplicates': DuplicateChecker(df=df, entities=entities, is_close=is_close, severity=severity),
'missings': MissingsProfiler(df=df, label=label, random_state=self.random_state, severity=severity),
'erroneous-data': ErroneousDataIdentifier(df=df, ed_extensions=ed_extensions, severity=severity),
'drift': DriftAnalyser(ref=df, sample=sample, label=label, model=model, random_state=self.random_state, severity=severity)
}

self._engines_new = {'data-relations': DataRelationsDetector()}
self._engines_new = {'data-relations': DataRelationsDetector(severity=severity)}
self._eval_args = { # Argument lists for different engines
# TODO: centralize shared args in a dictionary to pass just like a regular kwargs to engines, pass specific args in arg list (define here)
# In new standard all engines can be run at the evaluate method only, the evaluate run expression can then be:
@@ -94,18 +98,20 @@ def __init__(self,

# Engines based on mandatory arguments
if label is not None:
self._engines_legacy['labelling'] = LabelInspector(df=df, label=label, random_state=self.random_state)
self._engines_legacy['labelling'] = LabelInspector(df=df, label=label,
random_state=self.random_state, severity=severity)
else:
print('Label is not defined. Skipping LABELLING engine.')
self._logger.warning('Label is not defined. Skipping LABELLING engine.')
if len(sensitive_features)>0:
self._engines_legacy['bias&fairness'] = BiasFairness(df=df, sensitive_features=sensitive_features,
label=label, random_state=self.random_state)
label=label, random_state=self.random_state,
severity=severity)
else:
print('Sensitive features not defined. Skipping BIAS & FAIRNESS engine.')
self._logger.warning('Sensitive features not defined. Skipping BIAS & FAIRNESS engine.')
if results_json_path is not None:
self._engines_new['expectations'] = DataExpectationsReporter()
self._engines_new['expectations'] = DataExpectationsReporter(severity=severity)
else:
print('The path to a Great Expectations results json is not defined. Skipping EXPECTATIONS engine.')
self._logger.warning('The path to a Great Expectations results json is not defined. Skipping EXPECTATIONS engine.')


def __clean_warnings(self):
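
The logger utility itself (ydata_quality/utils/logger.py) is not part of this diff. As a rough sketch only, assuming get_logger wraps the standard logging module and NAME is the package-level logger name, it could look something like:

```python
import logging
import sys

NAME = 'ydata_quality'  # assumed package-level logger name

def get_logger(name: str = NAME, level: str = 'WARNING') -> logging.Logger:
    """Return a named logger; level is one of DEBUG, INFO, WARNING, ERROR, CRITICAL."""
    logger = logging.getLogger(name)
    logger.setLevel(level or 'WARNING')  # fall back to WARNING when severity is None
    if not logger.handlers:              # avoid stacking handlers on repeated calls
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter('%(asctime)s [%(name)s] %(levelname)s: %(message)s'))
        logger.addHandler(handler)
    return logger
```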
@@ -140,7 +146,7 @@ def random_state(self, new_state):
if new_state==None or (isinstance(new_state, int) and new_state>=0):
self._random_state = new_state
else:
print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None (no reproducibility).')
self._logger.warning('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None (no reproducibility).')
self._random_state = None

def __store_warnings(self):
@@ -158,7 +164,7 @@ def report(self):
self.__store_warnings() # fetch all warnings from the engines
self.__clean_warnings()
if not self._warnings:
print('No warnings found.')
self._logger.info('No warnings found.')
else:
prio_counts = Counter([warn.priority.value for warn in self._warnings])
print('Warnings count by priority:')
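
Since the top-level DataQuality constructor forwards severity to every engine it instantiates, a single argument now controls the verbosity of the whole run. A sketch with a hypothetical DataFrame and label; the import path is inferred from the file location:

```python
import pandas as pd
from ydata_quality.core.data_quality import DataQuality

df = pd.DataFrame({'feature_a': [1, 2, 3, 4], 'target': [0, 1, 0, 1]})  # toy data

# One severity value is propagated to DuplicateChecker, MissingsProfiler,
# ErroneousDataIdentifier, DriftAnalyser and the conditionally created engines.
dq = DataQuality(df=df, label='target', severity='INFO')
dq.report()  # with no stored warnings this now logs 'No warnings found.' at INFO level
```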
29 changes: 17 additions & 12 deletions src/ydata_quality/core/engine.py
@@ -10,16 +10,17 @@

from ydata_quality.core.warnings import Priority, QualityWarning
from ydata_quality.utils.auxiliary import infer_df_type, infer_dtypes
from ydata_quality.utils.enum import DataFrameType
from ydata_quality.utils.logger import get_logger, NAME


class QualityEngine(ABC):
"Main class for running and storing data quality analysis."

def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label: str = None, dtypes: dict = None):
def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label: str = None, dtypes: dict = None, severity: Optional[str]= None):
self._df = df
self._df_type = None
self._warnings = list()
self._logger = get_logger(NAME, level=severity)
self._tests = []
self._label = label
self._dtypes = dtypes
@@ -37,9 +38,8 @@ def label(self):

@label.setter
def label(self, label: str):
if not isinstance(label, str):
raise ValueError("Property 'label' should be a string.")
assert label in self.df.columns, "Given label should exist as a DataFrame column."
assert isinstance(label, str), "Property 'label' should be a string."
assert label in self.df.columns, "Provided label %s does not exist as a DataFrame column." % label
self._label = label

@property
@@ -52,11 +52,16 @@ def dtypes(self):
@dtypes.setter
def dtypes(self, dtypes: dict):
if not isinstance(dtypes, dict):
raise ValueError("Property 'dtypes' should be a dictionary.")
assert all(col in self.df.columns for col in dtypes), "All dtypes keys must be columns in the dataset."
self._logger.warning("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.")
dtypes = {}
cols_not_in_df = [col for col in dtypes if col not in self.df.columns]
if len(cols_not_in_df) > 0:
self._logger.warning("Passed dtypes keys %s are not columns of the provided dataset.", cols_not_in_df)
supported_dtypes = ['numerical', 'categorical']
assert all(dtype in supported_dtypes for dtype in dtypes.values()), "Assigned dtypes must be in the supported \
broad dtype list: {}.".format(supported_dtypes)
wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in supported_dtypes]
if len(wrong_dtypes) > 0:
self._logger.warning("Columns %s of dtypes were not defined with a supported dtype and will be inferred.", wrong_dtypes)
dtypes = {key:val for key, val in dtypes.items() if key not in cols_not_in_df+wrong_dtypes}
df_col_set = set(self.df.columns)
dtypes_col_set = set(dtypes.keys())
missing_cols = df_col_set.difference(dtypes_col_set)
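
The dtypes setter now degrades gracefully: unknown keys and unsupported dtype values are logged and dropped, then inferred, instead of tripping an assertion. A hedged illustration with hypothetical columns:

```python
import pandas as pd
from ydata_quality.bias_fairness import BiasFairness

df = pd.DataFrame({'gender': ['F', 'M'], 'age': [25, 32], 'income': [0, 1]})  # toy data
bf = BiasFairness(df=df, sensitive_features=['gender'], severity='WARNING')

# 'not_a_column' is missing from df and 'date' is not a supported broad dtype;
# both entries are now logged and dropped, and the affected dtypes are inferred instead.
bf.dtypes = {'gender': 'categorical', 'not_a_column': 'numerical', 'age': 'date'}
```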
@@ -85,7 +90,7 @@ def random_state(self, new_state):
self._random_state = new_state
random.seed(self.random_state)
except:
print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None.')
self._logger.warning('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None.')
self._random_state = None

def __clean_warnings(self):
Expand Down Expand Up @@ -116,7 +121,7 @@ def report(self):
"Prints a report containing all the warnings detected during the data quality analysis."
self.__clean_warnings()
if not self._warnings:
print('No warnings found.')
self._logger.info('No warnings found.')
else:
prio_counts = Counter([warn.priority.value for warn in self._warnings])
print('Warnings count by priority:')
@@ -133,6 +138,6 @@ def evaluate(self):
try: # if anything fails
results[test] = getattr(self, test)()
except Exception as exc: # print a Warning and log the message
print(f'WARNING: Skipping test {test} due to failure during computation.')
self._logger.warning('Skipping %s due to failure during computation. See results folder of this test for further details.', test)
results[test] = "[ERROR] Test failed to compute. Original exception: "+f"{exc}"
return results
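
Because messages now flow through a named logger rather than print(), callers can tune or redirect them with standard logging configuration. A small sketch, assuming NAME resolves to the 'ydata_quality' logger name:

```python
import logging

ydq_logger = logging.getLogger('ydata_quality')  # logger name assumed
ydq_logger.setLevel(logging.DEBUG)               # surface every message ...
ydq_logger.addHandler(logging.FileHandler('ydata_quality.log'))  # ... and also persist it to a file
```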
12 changes: 7 additions & 5 deletions src/ydata_quality/data_expectations/engine.py
@@ -8,15 +8,18 @@

from ydata_quality.core import QualityEngine, QualityWarning
from ydata_quality.utils.auxiliary import test_load_json_path
from ydata_quality.utils.logger import get_logger, NAME


class DataExpectationsReporter(QualityEngine):
"""Main class to run data expectation validation analysis.
Supports standard Great Expectations json reports from expectation suite validation runs.
"""

def __init__(self):
return # Override the base class init method
def __init__(self, severity: Optional[str]= None): # Overrides base class init
"severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL]"
self._warnings = [] # reset the warnings to avoid duplicates
self._logger = get_logger(NAME, level=severity)

@property
def tests(self):
@@ -175,18 +178,17 @@ def evaluate(self, results_json_path: str, df: pd.DataFrame = None, error_tol: i
rel_error_tol (float): Defines the maximum fraction of failed expectations, overrides error_tol.
minimum_coverage (float): Minimum expected fraction of DataFrame columns covered by the expectation suite.
"""
self._warnings = list() # reset the warnings to avoid duplicates
df = df if isinstance(df, pd.DataFrame) else None
results = {}
if df is not None:
try: # if anything fails
results['Coverage Fraction'] = self._coverage_fraction(
results_json_path, df, minimum_coverage=minimum_coverage)
except AssertionError as exc: # print a Warning and log the message
print("['DATA EXPECTATIONS'] Canceled Data Expectations engine execution due to dataset-expectation suite mismatch.")
self._logger.critical("Canceled Data Expectations engine execution due to dataset-expectation suite mismatch.")
return "[ERROR] Canceled computation. Original exception: "+f"{exc}"
else:
print("A valid DataFrame was not passed, skipping coverage fraction test.")
self._logger.error("A valid DataFrame was not passed, skipping coverage fraction test.")
results['Overall Assessment'] = self._overall_assessment(results_json_path, error_tol, rel_error_tol)
results['Expectation Level Assessment'] = self._expectation_level_assessment(results_json_path)
return results
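
A usage sketch for the reworked reporter; the dataset and results path below are hypothetical, and the keyword names are taken from the docstring above:

```python
import pandas as pd
from ydata_quality.data_expectations import DataExpectationsReporter

df = pd.read_csv('dataset.csv')  # hypothetical dataset backing the expectation suite
der = DataExpectationsReporter(severity='DEBUG')
results = der.evaluate('validation_results.json', df=df, minimum_coverage=0.75)
```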
26 changes: 15 additions & 11 deletions src/ydata_quality/data_relations/engine.py
@@ -9,13 +9,16 @@
from ydata_quality.core import QualityEngine, QualityWarning
from ydata_quality.utils.correlations import correlation_matrix, partial_correlation_matrix, correlation_plotter, vif_collinearity, chi2_collinearity
from ydata_quality.utils.auxiliary import infer_dtypes, standard_normalize
from ydata_quality.utils.logger import get_logger, NAME

class DataRelationsDetector(QualityEngine):
"""Main class to run data relations analysis.
"""

def __init__(self):
return # Override the base class init method
def __init__(self, severity: Optional[str] = None): # Overrides base class init
"severity (str, optional): Sets the logger warning threshold to one of the valid levels [DEBUG, INFO, WARNING, ERROR, CRITICAL]"
self._warnings = [] # reset the warnings to avoid duplicates
self._logger = get_logger(NAME, level=severity)

@property
def tests(self):
Expand All @@ -29,12 +32,16 @@ def dtypes(self):
def dtypes(self, df_dtypes: Tuple[pd.DataFrame, dict]):
df, dtypes = df_dtypes
if not isinstance(dtypes, dict):
raise ValueError("Property 'dtypes' should be a dictionary.")
assert all(col in df.columns for col in dtypes), "All dtypes keys \
must be columns in the dataset."
self._logger.warning("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.")
dtypes = {}
cols_not_in_df = [col for col in dtypes if col not in df.columns]
if len(cols_not_in_df) > 0:
self._logger.warning("Passed dtypes keys %s are not columns of the provided dataset.", cols_not_in_df)
supported_dtypes = ['numerical', 'categorical']
assert all(dtype in supported_dtypes for dtype in dtypes.values()), "Assigned dtypes\
must be in the supported broad dtype list: {}.".format(supported_dtypes)
wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in supported_dtypes]
if len(wrong_dtypes)>0:
self._logger.warning("Columns %s of dtypes where not defined with a supported dtype and will be inferred.", wrong_dtypes)
dtypes = {key:val for key, val in dtypes.items() if key not in cols_not_in_df+wrong_dtypes}
df_col_set = set(df.columns)
dtypes_col_set = set(dtypes.keys())
missing_cols = df_col_set.difference(dtypes_col_set)
@@ -60,9 +67,6 @@ def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=N
plot (bool): Pass True to produce all available graphical outputs, False to suppress all graphical output.
"""
assert label in df.columns or not label, "The provided label name does not exist as a column in the dataset"
self._warnings = [] # reset the warnings to avoid duplicates
if not dtypes:
dtypes = {}
self.dtypes = (df, dtypes) # Consider refactoring QualityEngine dtypes (df as argument of setter)
df = standard_normalize(df, dtypes)
results = {}
@@ -77,7 +81,7 @@ def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=N
results['Confounders'] = self._confounder_detection(corr_mat, p_corr_mat, corr_th)
results['Colliders'] = self._collider_detection(corr_mat, p_corr_mat, corr_th)
else:
print('[DATA RELATIONS] The partial correlation matrix is not computable for this dataset. Skipping potential confounder and collider detection tests.')
self._logger.warning('The partial correlation matrix is not computable for this dataset. Skipping potential confounder and collider detection tests.')
if label:
results['Feature Importance'] = self._feature_importance(corr_mat, p_corr_mat, label, corr_th)
results['High Collinearity'] = self._high_collinearity_detection(df, self.dtypes, label, vif_th, p_th=p_th)
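
And the corresponding sketch for the data relations engine; the dataset, column names, and the plot keyword are assumptions based on the docstring above:

```python
import pandas as pd
from ydata_quality.data_relations import DataRelationsDetector

df = pd.read_csv('dataset.csv')  # hypothetical dataset
drd = DataRelationsDetector(severity='WARNING')
results = drd.evaluate(df, dtypes={'age': 'numerical'}, label='income', plot=False)
```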