feat(DataQuality): Reproducible randomness in all engines #16

Merged
merged 5 commits on Sep 9, 2021
5 changes: 3 additions & 2 deletions src/ydata_quality/bias_fairness/engine.py
@@ -21,14 +21,15 @@ class BiasFairness(QualityEngine):
- Performance Discrimination: checks for performance disparities on sensitive attributes
"""

def __init__(self, df: pd.DataFrame, sensitive_features: List[str], label: Optional[str] = None):
def __init__(self, df: pd.DataFrame, sensitive_features: List[str], label: Optional[str] = None,
random_state: Optional[int] = None):
"""
Args
df (pd.DataFrame): reference DataFrame used to run the analysis
sensitive_features (List[str]): features deemed as sensitive attributes
label (str, optional): target feature to be predicted
"""
super().__init__(df=df, label=label)
super().__init__(df=df, label=label, random_state=random_state)
self._sensitive_features = sensitive_features
self._tests = ["performance_discrimination", "proxy_identification",
"sensitive_predictability", "sensitive_representativity"]
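Usage sketch (not part of the diff above): with the new keyword, a BiasFairness run can be made reproducible by fixing the seed at construction time. The import path is an assumption inferred from this repository layout.

import pandas as pd
from ydata_quality.bias_fairness.engine import BiasFairness  # path assumed from this repo layout

df = pd.DataFrame({
    "gender": ["F", "M", "F", "M", "F", "M"],
    "income": [40_000, 52_000, 47_000, 61_000, 39_000, 58_000],
    "approved": [1, 0, 1, 1, 0, 1],
})

# Fixing random_state makes any sampling done inside the engine repeatable across runs.
bf = BiasFairness(df=df, sensitive_features=["gender"], label="approved", random_state=42)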
24 changes: 21 additions & 3 deletions src/ydata_quality/core/data_quality.py
@@ -20,6 +20,7 @@ class DataQuality:
def __init__(self,
df: pd.DataFrame,
label: str = None,
random_state: Optional[int] = None,
entities: List[Union[str, List[str]]] = [],
vmv_extensions: Optional[list]=[],
sample: Optional[pd.DataFrame] = None,
@@ -37,23 +38,26 @@ def __init__(self,
df (pd.DataFrame): reference DataFrame used to run the DataQuality analysis.
label (str, optional): [MISSINGS, LABELLING, DRIFT ANALYSIS] target feature to be predicted.
If not specified, LABELLING is skipped.
random_state (int, optional): Integer seed for random reproducibility. Default is None.
Set to None for fully random behaviour, no reproducibility.
entities: [DUPLICATES] entities relevant for duplicate analysis.
vmv_extensions: [VALUED MISSING VALUES] A list of user provided valued missing values to append to defaults.
sample: [DRIFT ANALYSIS] data against which drift is tested.
model: [DRIFT ANALYSIS] model wrapped by ModelWrapper used to test concept drift.
"""
self.df = df
self._warnings = list()
self._random_state = random_state
self._engines = { # Default list of engines
'duplicates': DuplicateChecker(df=df, entities=entities),
'missings': MissingsProfiler(df=df, target=label),
'missings': MissingsProfiler(df=df, target=label, random_state=self.random_state),
'valued-missing-values': VMVIdentifier(df=df, vmv_extensions=vmv_extensions),
'drift-analysis': DriftAnalyser(ref=df, sample=sample, label=label, model=model)
'drift-analysis': DriftAnalyser(ref=df, sample=sample, label=label, model=model, random_state=self.random_state)
}

# Engines based on mandatory arguments
if label is not None:
self._engines['labelling'] = LabelInspector(df=df, label=label)
self._engines['labelling'] = LabelInspector(df=df, label=label, random_state=self.random_state)
else:
print('Label is not defined. Skipping LABELLING engine.')

@@ -78,6 +82,20 @@ def engines(self):
"Dictionary of instantiated engines to run data quality analysis."
return self._engines

@property
def random_state(self):
"Random state passed to individual engines on evaluate."
return self._random_state

@random_state.setter
def random_state(self, new_state):
"Sets new state to random state."
if new_state==None or (isinstance(new_state, int) and new_state>=0):
self._random_state = new_state
else:
print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None (no reproducibility).')
self._random_state = None

def __store_warnings(self):
"Appends all warnings from individiual engines into warnings of DataQuality main class."
for engine in self.engines.values():
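Usage sketch (not part of the diff above) of the new parameter and the setter validation shown in this file; the import path is an assumption based on the file's location.

import pandas as pd
from ydata_quality.core.data_quality import DataQuality  # path assumed

df = pd.DataFrame({"feature": range(8), "target": [0, 1] * 4})

dq = DataQuality(df=df, label="target", random_state=42)  # seed forwarded to the engines
print(dq.random_state)  # 42

dq.random_state = -1    # rejected by the setter: warns and falls back to None
print(dq.random_state)  # None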
19 changes: 18 additions & 1 deletion src/ydata_quality/core/engine.py
@@ -6,6 +6,7 @@
from typing import Optional

import pandas as pd
from numpy import random

from ydata_quality.core.warnings import Priority, QualityWarning
from ydata_quality.utils.modelling import infer_dtypes
@@ -14,12 +15,13 @@
class QualityEngine(ABC):
"Main class for running and storing data quality analysis."

def __init__(self, df: pd.DataFrame, label: str = None, dtypes: dict = None):
def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label: str = None, dtypes: dict = None):
self._df = df
self._warnings = list()
self._tests = []
self._label = label
self._dtypes = dtypes
self._random_state = random_state

@property
def df(self):
@@ -62,6 +64,21 @@ def dtypes(self, dtypes: dict):
dtypes[col] = dtype
self._dtypes = dtypes

@property
def random_state(self):
"Last set random state."
return self._random_state

@random_state.setter
def random_state(self, new_state):
"Sets new state to random state."
try:
self._random_state = new_state
random.seed(self.random_state)
except (TypeError, ValueError):
print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None.')
self._random_state = None

def __clean_warnings(self):
"""Deduplicates and sorts the list of warnings."""
self._warnings = sorted(list(set(self._warnings))) # Sort unique warnings by priority
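For reference, the setter above relies on seeding NumPy's global legacy RNG; a minimal standalone illustration of why that makes downstream draws reproducible:

from numpy import random

random.seed(42)          # what the random_state setter does under the hood
first = random.rand(3)

random.seed(42)          # re-seeding with the same value...
second = random.rand(3)  # ...reproduces the exact same draws

assert (first == second).all()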
14 changes: 7 additions & 7 deletions src/ydata_quality/drift/engine.py
@@ -5,6 +5,7 @@

import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
from scipy.stats import ks_2samp
from scipy.stats._continuous_distns import chi2_gen
@@ -69,7 +70,7 @@ class DriftAnalyser(QualityEngine):

def __init__(self, ref: pd.DataFrame, sample: Optional[pd.DataFrame] = None,
label: Optional[str] = None, model: Callable = None, holdout: float = 0.2,
random_state: Optional[int] = 0):
random_state: Optional[int] = None):
"""
Initializes the engine properties and lists tests for automated evaluation.
Args:
@@ -84,12 +85,11 @@ def __init__(self, ref: pd.DataFrame, sample: Optional[pd.DataFrame] = None,
random_state (Optional, int): Seed used to guarantee reproducibility of the random sample splits.
Pass None for no reproducibility.
"""
super().__init__(df=ref, label=label)
super().__init__(df=ref, label=label, random_state=random_state)
self.sample = sample
self._model = model
self.has_model = None
self._random_state = random_state
self._holdout, self._remaining_data = random_split(ref, holdout, random_state=self._random_state)
self._holdout, self._remaining_data = random_split(ref, holdout, random_state=self.random_state)
self._tests = ['ref_covariate_drift', 'ref_label_drift', 'sample_covariate_drift',
'sample_label_drift', 'sample_concept_drift']

@@ -194,7 +194,7 @@ def ref_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame:
bonferroni_p = p_thresh/len(covariates.columns) # Bonferroni correction
all_p_vals = pd.DataFrame(index=perc_index, columns=covariates.columns)
for idx, fraction in enumerate(leftover_fractions):
downsample, _ = random_split(covariates, fraction, random_state=self._random_state)
downsample, _ = random_split(covariates, fraction, random_state=self.random_state)
p_vals = []
for column in covariates.columns:
_, p_val, _ = self._2sample_feat_good_fit(ref_sample = holdout[column],
@@ -223,10 +223,10 @@ def ref_label_drift(self, p_thresh: float= 0.05):
labels = self._remaining_data[self.label].copy()
holdout = self._holdout[self.label]
leftover_fractions = np.arange(0.2, 1.2, 0.2)
p_values = pd.DataFrame(index=["{0:.0%}".format(fraction) for fraction in leftover_fractions],
p_values = pd.DataFrame(index=["{:.0%}".format(fraction) for fraction in leftover_fractions],
columns=['Label p-value', 'p-value threshold'])
for idx, fraction in enumerate(leftover_fractions):
downsample, _ = random_split(labels, fraction, random_state=self._random_state)
downsample, _ = random_split(labels, fraction, random_state=self.random_state)
_, p_val, test_name = self._2sample_feat_good_fit(ref_sample = holdout,
test_sample = downsample)
p_values['Label p-value'].iloc[idx] = p_val
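Usage sketch (not part of the diff above): with the default now None, the holdout split is fully random unless a seed is passed explicitly. Import path assumed from this repository layout.

import pandas as pd
from ydata_quality.drift.engine import DriftAnalyser  # path assumed

ref = pd.DataFrame({"x": range(100), "y": [i % 2 for i in range(100)]})
sample = ref.sample(frac=0.5, random_state=0)

# An integer seed makes the internal holdout/remaining split reproducible;
# omitting it (None) gives a different split on every run.
da = DriftAnalyser(ref=ref, sample=sample, label="y", random_state=24)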
3 changes: 1 addition & 2 deletions src/ydata_quality/duplicates/engine.py
@@ -13,9 +13,8 @@ class DuplicateChecker(QualityEngine):
"Engine for running analyis on duplicate records."

def __init__(self, df: pd.DataFrame, entities: List[Union[str, List[str]]] = []):
self._df = df
super().__init__(df=df)
self._entities = entities
self._warnings = set()
self._tests = ["exact_duplicates", "entity_duplicates", "duplicate_columns"]

@property
20 changes: 10 additions & 10 deletions src/ydata_quality/labelling/engine.py
@@ -1,7 +1,7 @@
"""
Implementation of LabelInspector engine class to run label quality analysis.
"""
from typing import Union
from typing import Union, Optional

import pandas as pd

Expand All @@ -13,20 +13,20 @@
standard_transform)


def LabelInspector(df, label):
def LabelInspector(df, label, random_state: Optional[int]=None):
"""Instantiate this label inspector class.
Runs a label type inference to instantiate the correct label inspector."""
label_dtype = infer_dtypes(df[label])[label] # Label column dtype inferral
if label_dtype == 'categorical':
return CategoricalLabelInspector(df, label)
return CategoricalLabelInspector(df, label, random_state=random_state)
else:
return NumericalLabelInspector(df, label)
return NumericalLabelInspector(df, label, random_state=random_state)

class SharedLabelInspector(QualityEngine):
"""Shared structure for Numerical/Categorical Label Inspector"""

def __init__(self, df: pd.DataFrame, label: str):
super().__init__(df=df, label=label)
def __init__(self, df: pd.DataFrame, label: str, random_state=None):
super().__init__(df=df, label=label, random_state=random_state)
self._tdf = None

@property
@@ -75,8 +75,8 @@ class CategoricalLabelInspector(SharedLabelInspector):
"""Engine for running analysis on categorical labels.
Ordinal labels can be handled if passed as categorical."""

def __init__(self, df: pd.DataFrame, label: str):
super().__init__(df=df, label=label)
def __init__(self, df: pd.DataFrame, label: str, random_state: Optional[int]):
super().__init__(df=df, label=label, random_state=random_state)
self._centroids = None
self._tests = ["missing_labels", "few_labels", "unbalanced_classes",
"one_vs_rest_performance", "outlier_detection"]
@@ -240,8 +240,8 @@ def outlier_detection(self, th=3):
class NumericalLabelInspector(SharedLabelInspector):
"Engine for running analyis on numerical labels."

def __init__(self, df: pd.DataFrame, label: str):
super().__init__(df, label)
def __init__(self, df: pd.DataFrame, label: str, random_state):
super().__init__(df=df, label=label, random_state=random_state)
self._tests = ["missing_labels", "test_normality", "outlier_detection"]

def _GMM_clusters(self, max_clusters):
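Usage sketch (not part of the diff above): the factory infers the label dtype and forwards the seed to the concrete inspector it returns, so sampling-based tests are repeatable. Import path assumed from this repository layout.

import pandas as pd
from ydata_quality.labelling.engine import LabelInspector  # path assumed

df = pd.DataFrame({"label": ["a", "b", "a", "c", "b", "a"], "x": range(6)})

# Returns a CategoricalLabelInspector here (string label), seeded for repeatability.
li = LabelInspector(df=df, label="label", random_state=7)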
4 changes: 2 additions & 2 deletions src/ydata_quality/missings/engine.py
@@ -15,14 +15,14 @@
class MissingsProfiler(QualityEngine):
"Main class to run missing value analysis."

def __init__(self, df: pd.DataFrame, target: Optional[str] = None):
def __init__(self, df: pd.DataFrame, target: Optional[str] = None, random_state: Optional[int]=None):
"""
Args:
df (pd.DataFrame): reference DataFrame used to run the missing value analysis.
target (str, optional): target feature to be predicted
"""
#TODO: Rename 'target' argument to 'label' standard of QualityEngine
super().__init__(df=df)
super().__init__(df=df, random_state=random_state)
self._target = target
self._tests = ["nulls_higher_than", "high_missing_correlations", "predict_missings"]

7 changes: 3 additions & 4 deletions src/ydata_quality/utils/auxiliary.py
@@ -24,19 +24,18 @@ def test_load_json_path(json_path: str) -> dict:
raise IOError("Expected a path to a json file.")
return json_dict

def random_split(df: Union[pd.DataFrame, pd.Series], split_size: float, shuffle=True,
random_state: int=None) -> Tuple[pd.DataFrame]:
def random_split(df: Union[pd.DataFrame, pd.Series], split_size: float, shuffle: bool=True, random_state: int=None) -> Tuple[pd.DataFrame]:
"""Shuffles a DataFrame and splits it into 2 partitions according to split_size.
Returns a tuple with the split first (the partition corresponding to split_size) and the remainder second.
Args:
df (pd.DataFrame): A DataFrame to be split
split_size (float): Fraction of the sample to be taken
shuffle (bool): If True shuffles sample rows before splitting
random_state (int): If an int is passed, the random process is reproducible using the provided seed"""
assert random_state is None or (isinstance(random_state, int) and random_state>=0), 'The random seed must be a non-negative integer or None.'
assert 0<= split_size <=1, 'split_size must be a fraction, i.e. a float in the [0,1] interval.'
assert random_state is None or isinstance(random_state, int), 'The random seed must be an integer or None.'
if shuffle: # Shuffle dataset rows
sample = df.sample(frac=1, random_state=random_state) # An int random_state ensures reproducibility
sample = df.sample(frac=1, random_state=random_state)
split_len = int(sample.shape[0]*split_size)
split = sample.iloc[:split_len]
remainder = sample.iloc[split_len:]
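A minimal sketch of the reproducibility contract of random_split (import path assumed from this repository layout): the same non-negative integer seed always yields the same shuffle, hence identical partitions.

import pandas as pd
from ydata_quality.utils.auxiliary import random_split  # path assumed

df = pd.DataFrame({"x": range(10)})

split_a, rest_a = random_split(df, split_size=0.3, random_state=42)
split_b, rest_b = random_split(df, split_size=0.3, random_state=42)

# Same seed, same shuffle, same partitions.
assert split_a.equals(split_b) and rest_a.equals(rest_b)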