feat: add loggers and change prints to logs #27

Merged · 11 commits · Sep 22, 2021
3 changes: 1 addition & 2 deletions src/ydata_quality/bias_fairness/engine.py
@@ -93,8 +93,7 @@ def performance_discrimination(self):
"""
# TODO: support error rate parity metrics (e.g. false positive rate, positive rate)
if self.label is None:
print('[BIAS&FAIRNESS] Argument "label" must be defined to calculate performance discrimination metric. Skipping test.')
pass
self._logger.warning('Argument "label" must be defined to calculate performance discrimination metric. Skipping test.')

res = {}
for feat in self.sensitive_features:
17 changes: 10 additions & 7 deletions src/ydata_quality/core/data_quality.py
@@ -15,6 +15,7 @@
from ydata_quality.data_expectations import DataExpectationsReporter
from ydata_quality.bias_fairness import BiasFairness
from ydata_quality.data_relations import DataRelationsDetector
from ydata_quality.utils.logger import *

class DataQuality:
"DataQuality contains the multiple data quality engines."
@@ -55,7 +56,7 @@ def __init__(self,
label (str, optional): [MISSINGS, LABELLING, DRIFT ANALYSIS] target feature to be predicted.
If not specified, LABELLING is skipped.
random_state (int, optional): Integer seed for random reproducibility. Default is None.
Set to None for fully random behaviour, no reproducibility.
Set to None for fully random behavior, no reproducibility.
entities: [DUPLICATES] entities relevant for duplicate analysis.
is_close: [DUPLICATES] Pass True to use numpy.isclose instead of pandas.equals in column comparison.
ed_extensions: [ERRONEOUS DATA] A list of user provided erroneous data values to append to defaults.
@@ -75,10 +76,12 @@ def __init__(self,
#TODO: Refactor legacy engines (property based) and logic in this class to new base (lean objects)
self.df = df
self._warnings = list()
self._logger = create_logger(NAME, STREAM, LOG_LEVEL)
self._random_state = random_state

self._engines_legacy = { # Default list of engines
'duplicates': DuplicateChecker(df=df, entities=entities, is_close=is_close),
'missings': MissingsProfiler(df=df, target=label, random_state=self.random_state),
'missings': MissingsProfiler(df=df, label=label, random_state=self.random_state),
'erroneous-data': ErroneousDataIdentifier(df=df, ed_extensions=ed_extensions),
'drift': DriftAnalyser(ref=df, sample=sample, label=label, model=model, random_state=self.random_state)
}
@@ -96,16 +99,16 @@ def __init__(self,
if label is not None:
self._engines_legacy['labelling'] = LabelInspector(df=df, label=label, random_state=self.random_state)
else:
print('Label is not defined. Skipping LABELLING engine.')
self._logger.warning('Label is not defined. Skipping LABELLING engine.')
if len(sensitive_features)>0:
self._engines_legacy['bias&fairness'] = BiasFairness(df=df, sensitive_features=sensitive_features,
label=label, random_state=self.random_state)
else:
print('Sensitive features not defined. Skipping BIAS & FAIRNESS engine.')
self._logger.warning('Sensitive features not defined. Skipping BIAS & FAIRNESS engine.')
if results_json_path is not None:
self._engines_new['expectations'] = DataExpectationsReporter()
else:
print('The path to a Great Expectations results json is not defined. Skipping EXPECTATIONS engine.')
self._logger.warning('The path to a Great Expectations results json is not defined. Skipping EXPECTATIONS engine.')


def __clean_warnings(self):
@@ -140,7 +143,7 @@ def random_state(self, new_state):
if new_state==None or (isinstance(new_state, int) and new_state>=0):
self._random_state = new_state
else:
print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None (no reproducibility).')
self._logger.warning('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None (no reproducibility).')
self._random_state = None

def __store_warnings(self):
@@ -158,7 +161,7 @@ def report(self):
self.__store_warnings() # fetch all warnings from the engines
self.__clean_warnings()
if not self._warnings:
print('No warnings found.')
self._logger.info('No warnings found.')
else:
prio_counts = Counter([warn.priority.value for warn in self._warnings])
print('Warnings count by priority:')
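Note on the new wiring: `create_logger(NAME, STREAM, LOG_LEVEL)` and its constants come from `ydata_quality.utils.logger`, a module not shown in this diff, so its exact API is not visible here. Below is a minimal sketch of what such a helper could look like using only the standard library; the names, defaults, and format string are assumptions about the intent, not code from the PR.

```python
# Hypothetical sketch of src/ydata_quality/utils/logger.py -- not taken from this PR.
import logging
import sys

NAME = "ydata_quality"       # assumed package-wide logger name
STREAM = sys.stdout          # assumed default output stream
LOG_LEVEL = logging.INFO     # assumed default verbosity


def create_logger(name: str = NAME, stream=STREAM, level: int = LOG_LEVEL) -> logging.Logger:
    """Return a named logger with a single stream handler attached."""
    logger = logging.getLogger(name)
    logger.setLevel(level)
    if not logger.handlers:  # avoid stacking handlers when several engines call this
        handler = logging.StreamHandler(stream)
        handler.setFormatter(logging.Formatter("[%(name)s] %(levelname)s: %(message)s"))
        logger.addHandler(handler)
    return logger
```

Under that assumption, every engine calling `create_logger(NAME, STREAM, LOG_LEVEL)` shares one named logger, so messages such as the "Skipping LABELLING engine" warning above get a consistent format and destination.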
26 changes: 16 additions & 10 deletions src/ydata_quality/core/engine.py
@@ -11,6 +11,7 @@
from ydata_quality.core.warnings import Priority, QualityWarning
from ydata_quality.utils.auxiliary import infer_df_type, infer_dtypes
from ydata_quality.utils.enum import DataFrameType
from ydata_quality.utils.logger import *


class QualityEngine(ABC):
@@ -20,6 +21,7 @@ def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label:
self._df = df
self._df_type = None
self._warnings = list()
self._logger = create_logger(NAME, STREAM, LOG_LEVEL)
self._tests = []
self._label = label
self._dtypes = dtypes
@@ -37,9 +39,8 @@ def label(self):

@label.setter
def label(self, label: str):
if not isinstance(label, str):
raise ValueError("Property 'label' should be a string.")
assert label in self.df.columns, "Given label should exist as a DataFrame column."
assert isinstance(label, str), "Property 'label' should be a string."
assert label in self.df.columns, "Provided label %s does not exist as a DataFrame column." % label
self._label = label

@property
@@ -52,11 +53,16 @@ def dtypes(self):
@dtypes.setter
def dtypes(self, dtypes: dict):
if not isinstance(dtypes, dict):
raise ValueError("Property 'dtypes' should be a dictionary.")
assert all(col in self.df.columns for col in dtypes), "All dtypes keys must be columns in the dataset."
self._logger.warning("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.")
dtypes = {}
cols_not_in_df = [col for col in dtypes if col not in self.df.columns]
if len(cols_not_in_df) > 0:
self._logger.warning("Passed dtypes keys %s are not columns of the provided dataset.", cols_not_in_df)
supported_dtypes = ['numerical', 'categorical']
assert all(dtype in supported_dtypes for dtype in dtypes.values()), "Assigned dtypes must be in the supported \
broad dtype list: {}.".format(supported_dtypes)
wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in supported_dtypes]
if len(wrong_dtypes) > 0:
self._logger.warning("Columns %s of dtypes were not defined with a supported dtype and will be inferred.", wrong_dtypes)
dtypes = {key:val for key, val in dtypes.items() if key not in cols_not_in_df+wrong_dtypes}
df_col_set = set(self.df.columns)
dtypes_col_set = set(dtypes.keys())
missing_cols = df_col_set.difference(dtypes_col_set)
@@ -85,7 +91,7 @@ def random_state(self, new_state):
self._random_state = new_state
random.seed(self.random_state)
except:
print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None.')
self._logger.warning('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None.')
self._random_state = None

def __clean_warnings(self):
@@ -116,7 +122,7 @@ def report(self):
"Prints a report containing all the warnings detected during the data quality analysis."
self.__clean_warnings()
if not self._warnings:
print('No warnings found.')
self._logger.info('No warnings found.')
else:
prio_counts = Counter([warn.priority.value for warn in self._warnings])
print('Warnings count by priority:')
@@ -133,6 +139,6 @@ def evaluate(self):
try: # if anything fails
results[test] = getattr(self, test)()
except Exception as exc: # print a Warning and log the message
print(f'WARNING: Skipping test {test} due to failure during computation.')
self._logger.warning('Skipping test due to failure during computation. See results folder of this test for further details.')
results[test] = "[ERROR] Test failed to compute. Original exception: "+f"{exc}"
return results
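The reworked `dtypes` setter above swaps hard assertions for warnings and drops invalid entries before inference takes over. Here is a self-contained sketch of that filtering step, mirroring the hunk; the helper name is hypothetical and not part of the PR.

```python
import logging
from typing import Optional

import pandas as pd

SUPPORTED_DTYPES = ("numerical", "categorical")


def sanitize_dtypes(df: pd.DataFrame, dtypes: Optional[dict], logger: logging.Logger) -> dict:
    """Drop dtype entries that cannot be honored, warning instead of raising."""
    if not isinstance(dtypes, dict):
        logger.warning("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.")
        dtypes = {}
    # Keys that are not columns of the provided DataFrame
    cols_not_in_df = [col for col in dtypes if col not in df.columns]
    if cols_not_in_df:
        logger.warning("Passed dtypes keys %s are not columns of the provided dataset.", cols_not_in_df)
    # Columns whose assigned dtype is outside the supported broad dtypes
    wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in SUPPORTED_DTYPES]
    if wrong_dtypes:
        logger.warning("Columns %s were not given a supported dtype and will be inferred.", wrong_dtypes)
    return {col: dtype for col, dtype in dtypes.items() if col not in cols_not_in_df + wrong_dtypes}
```

For example, passing `{'age': 'numerical', 'ghost': 'categorical', 'name': 'text'}` against a DataFrame without a `ghost` column would keep only `{'age': 'numerical'}` and log two warnings, which is the behaviour the new setter appears to aim for.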
4 changes: 2 additions & 2 deletions src/ydata_quality/data_expectations/engine.py
@@ -183,10 +183,10 @@ def evaluate(self, results_json_path: str, df: pd.DataFrame = None, error_tol: i
results['Coverage Fraction'] = self._coverage_fraction(
results_json_path, df, minimum_coverage=minimum_coverage)
except AssertionError as exc: # print a Warning and log the message
print("['DATA EXPECTATIONS'] Canceled Data Expectations engine execution due to dataset-expectation suite mismatch.")
self._logger.critical("Canceled Data Expectations engine execution due to dataset-expectation suite mismatch.")
return "[ERROR] Canceled computation. Original exception: "+f"{exc}"
else:
print("A valid DataFrame was not passed, skipping coverage fraction test.")
self._logger.error("A valid DataFrame was not passed, skipping coverage fraction test.")
results['Overall Assessment'] = self._overall_assessment(results_json_path, error_tol, rel_error_tol)
results['Expectation Level Assessment'] = self._expectation_level_assessment(results_json_path)
return results
16 changes: 10 additions & 6 deletions src/ydata_quality/data_relations/engine.py
@@ -29,12 +29,16 @@ def dtypes(self):
def dtypes(self, df_dtypes: Tuple[pd.DataFrame, dict]):
df, dtypes = df_dtypes
if not isinstance(dtypes, dict):
raise ValueError("Property 'dtypes' should be a dictionary.")
assert all(col in df.columns for col in dtypes), "All dtypes keys \
must be columns in the dataset."
self._logger.warning("Property 'dtypes' should be a dictionary. Defaulting to all column dtypes inference.")
dtypes = {}
cols_not_in_df = [col for col in dtypes if col not in df.columns]
if len(cols_not_in_df) > 0:
self._logger.warning("Passed dtypes keys %s are not columns of the provided dataset.", cols_not_in_df)
supported_dtypes = ['numerical', 'categorical']
assert all(dtype in supported_dtypes for dtype in dtypes.values()), "Assigned dtypes\
must be in the supported broad dtype list: {}.".format(supported_dtypes)
wrong_dtypes = [col for col, dtype in dtypes.items() if dtype not in supported_dtypes]
if len(wrong_dtypes)>0:
self._logger.warning("Columns %s of dtypes where not defined with a supported dtype and will be inferred.", wrong_dtypes)
dtypes = {key:val for key, val in dtypes.items() if key not in cols_not_in_df+wrong_dtypes}
df_col_set = set(df.columns)
dtypes_col_set = set(dtypes.keys())
missing_cols = df_col_set.difference(dtypes_col_set)
@@ -77,7 +81,7 @@ def evaluate(self, df: pd.DataFrame, dtypes: Optional[dict] = None, label: str=N
results['Confounders'] = self._confounder_detection(corr_mat, p_corr_mat, corr_th)
results['Colliders'] = self._collider_detection(corr_mat, p_corr_mat, corr_th)
else:
print('[DATA RELATIONS] The partial correlation matrix is not computable for this dataset. Skipping potential confounder and collider detection tests.')
self._logger.warning('The partial correlation matrix is not computable for this dataset. Skipping potential confounder and collider detection tests.')
if label:
results['Feature Importance'] = self._feature_importance(corr_mat, p_corr_mat, label, corr_th)
results['High Collinearity'] = self._high_collinearity_detection(df, self.dtypes, label, vif_th, p_th=p_th)
8 changes: 4 additions & 4 deletions src/ydata_quality/drift/engine.py
@@ -223,7 +223,7 @@ def ref_label_drift(self, p_thresh: float= 0.05):
Args:
p_thresh (float): The p_threshold used for the test."""
if self.label is None:
print("[REFERENCE LABEL DRIFT] No label was provided. Test skipped.")
self._logger.warning("No label was provided. Test skipped.")
return
labels = self._remaining_data[self.label].copy()
holdout = self._holdout[self.label]
@@ -280,7 +280,7 @@ def sample_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame:
description=f"""There were {n_invalid_tests} invalid tests found. This is likely due to a small test sample size. The data summary should be analyzed before considering the test conclusive."""
))
else:
print("[SAMPLE COVARIATE DRIFT] Covariate drift was not detected in the test sample.")
self._logger.info("Covariate drift was not detected in the test sample.")
return test_summary

def sample_label_drift(self, p_thresh: float= 0.05) -> pd.Series:
@@ -312,7 +312,7 @@ def sample_label_drift(self, p_thresh: float= 0.05) -> pd.Series:
description="The test was invalid. This is likely due to a small test sample size."
))
else:
print("[SAMPLE LABEL DRIFT] Label drift was not detected in the test sample.")
self._logger.info("Label drift was not detected in the test sample.")
return test_summary

def sample_concept_drift(self, p_thresh: float= 0.05) -> pd.Series:
@@ -349,5 +349,5 @@ def sample_concept_drift(self, p_thresh: float= 0.05) -> pd.Series:
description="The test was invalid. This is likely due to a small test sample size."
))
else:
print("[CONCEPT DRIFT] Concept drift was not detected between the reference and the test samples.")
self._logger.info("Concept drift was not detected between the reference and the test samples.")
return test_summary
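The drift hunks above follow a consistent level convention: skipped or invalid tests log at warning, clean "no drift detected" outcomes at info. Assuming the engines use a standard-library logger named after the package (the name is an assumption, not confirmed by this diff), a caller could silence the informational messages like so:

```python
import logging

# Assumed logger name; after this call only WARNING and above
# (skipped tests, invalid inputs) are emitted.
logging.getLogger("ydata_quality").setLevel(logging.WARNING)
```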
10 changes: 7 additions & 3 deletions src/ydata_quality/duplicates/engine.py
@@ -14,6 +14,10 @@ class DuplicateChecker(QualityEngine):
"Engine for running analyis on duplicate records."

def __init__(self, df: pd.DataFrame, entities: List[Union[str, List[str]]] = [], is_close: bool=False):
"""
df (pd.DataFrame): reference DataFrame used to run the DataQuality analysis.
entities (List[Union[str, List[str]]]): entities relevant for duplicate analysis. Passing lists allows composed entities of multiple columns.
is_close (bool): Pass True to use numpy.isclose instead of pandas.equals in column comparison."""
super().__init__(df=df)
self._entities = entities
self._tests = ["exact_duplicates", "entity_duplicates", "duplicate_columns"]
@@ -58,7 +62,7 @@ def exact_duplicates(self):
description=f"Found {len(dups)} instances with exact duplicate feature values."
))
else:
print("[EXACT DUPLICATES] No exact duplicates were found.")
self._logger.info("No exact duplicates were found.")
dups = None
return dups

@@ -92,7 +96,7 @@ def entity_duplicates(self, entity: Optional[Union[str, List[str]]] = None):
ent_dups.setdefault(entity_key, {})[val] = dups[(dups[entity].values==val).all(axis=1)]
else: # if entity is not specified
if len(self.entities) == 0:
print("[ENTITY DUPLICATES] There are no entities defined to run the analysis. Skipping the test.")
self._logger.info("There are no entities defined to run the analysis. Skipping the test.")
return None
else:
for col in self.entities:
@@ -111,6 +115,6 @@ def duplicate_columns(self):
)
)
else:
print("[DUPLICATE COLUMNS] No duplicate columns were found.")
self._logger.info("No duplicate columns were found.")
dups = None
return dups
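Given the new constructor docstring, a short usage sketch of composed entities may help; the DataFrame, entity names, and import path below are illustrative assumptions rather than code from the repository.

```python
import pandas as pd
from ydata_quality.duplicates import DuplicateChecker  # assumed import path

# Toy data: 'id' is a single-column entity, ['city', 'store'] a composed entity.
df = pd.DataFrame({
    "id":    [1, 1, 2, 3],
    "city":  ["Porto", "Porto", "Lisbon", "Porto"],
    "store": ["A", "A", "B", "A"],
    "sales": [10.0, 10.0, 7.5, 12.0],
})

dc = DuplicateChecker(df=df, entities=["id", ["city", "store"]], is_close=False)
results = dc.evaluate()  # runs exact_duplicates, entity_duplicates and duplicate_columns
```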
8 changes: 3 additions & 5 deletions src/ydata_quality/erroneous_data/engine.py
@@ -83,7 +83,7 @@ def flatlines(self, th: int=5, skip: list=[]):
skip: List of columns that will not be target of search for flatlines.
Pass '__index' inside skip list to skip looking for flatlines at the index."""
if self.df_type == DataFrameType.TABULAR:
print('[FLATLINES] The provided DataFrame is not a valid Timeseries type, skipping this test.')
self._logger.info('The provided DataFrame is not a valid Timeseries type, skipping this test.')
return None
flatlines = {}
for column in self.df.columns: # Compile flatline index
@@ -101,7 +101,7 @@ def predefined_erroneous_data(self, skip: list=[], short: bool = True):
))
return flatlines
else:
print("[FLATLINES] No flatline events with a minimum length of {} were found.".format(th))
self._logger.info("No flatline events with a minimum length of %f were found.", th)

def predefined_erroneous_data(self, skip: list=[], short: bool = True):
"""Runs a check against a list of predefined erroneous data values.
@@ -127,9 +127,7 @@ def predefined_erroneous_data(self, skip: list=[], short: bool = True):
eds.drop(no_ed_cols, axis=1, inplace=True)
eds.drop(no_ed_rows, inplace=True)
if eds.empty:
print("[PREDEFINED ERRONEOUS DATA] No predefined ED values from the set {} were found in the dataset.".format(
self.err_data
))
self._logger.info("No predefined ED values from the set %s were found in the dataset.", self.err_data)
else:
total_eds = eds.sum().sum()
self.store_warning(
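A side note on message style: the new calls above pass arguments separately (e.g. `self._logger.info("... %s ...", self.err_data)`) instead of pre-building the string. A generic comparison of the two forms, unrelated to the library's own code:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")

err_data = ["?", "UNK", "999999"]

# Eager formatting: the f-string is built even when INFO is filtered out.
logger.info(f"No predefined ED values from the set {err_data} were found in the dataset.")

# Lazy %-formatting: interpolation happens only if the record is actually emitted,
# and handlers still receive the raw template plus its arguments.
logger.info("No predefined ED values from the set %s were found in the dataset.", err_data)
```

Both lines print the same text at INFO level; the second defers string interpolation and keeps the message template greppable.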