feat(DataQuality): Reproducible randomness in all engines #16

Merged
merged 5 commits on Sep 9, 2021
5 changes: 3 additions & 2 deletions src/ydata_quality/bias_fairness/engine.py
@@ -21,14 +21,15 @@ class BiasFairness(QualityEngine):
- Performance Discrimination: checks for performance disparities on sensitive attributes
"""

def __init__(self, df: pd.DataFrame, sensitive_features: List[str], label: Optional[str] = None):
def __init__(self, df: pd.DataFrame, sensitive_features: List[str], label: Optional[str] = None,
random_state: Optional[int] = None):
"""
Args
df (pd.DataFrame): reference DataFrame used to run the analysis
sensitive_features (List[str]): features deemed as sensitive attributes
label (str, optional): target feature to be predicted
"""
super().__init__(df=df, label=label)
super().__init__(df=df, label=label, random_state=random_state)
self._sensitive_features = sensitive_features
self._tests = ["performance_discrimination", "proxy_identification",
"sensitive_predictability", "sensitive_representativity"]
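Usage sketch (not part of the diff above): with the new keyword, a BiasFairness run can be made reproducible by fixing the seed at construction time. The import path is an assumption inferred from this repository layout.

import pandas as pd
from ydata_quality.bias_fairness.engine import BiasFairness  # path assumed from this repo layout

df = pd.DataFrame({
    "gender": ["F", "M", "F", "M", "F", "M"],
    "income": [40_000, 52_000, 47_000, 61_000, 39_000, 58_000],
    "approved": [1, 0, 1, 1, 0, 1],
})

# Fixing random_state makes any sampling done inside the engine repeatable across runs.
bf = BiasFairness(df=df, sensitive_features=["gender"], label="approved", random_state=42)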
24 changes: 21 additions & 3 deletions src/ydata_quality/core/data_quality.py
@@ -20,6 +20,7 @@ class DataQuality:
def __init__(self,
df: pd.DataFrame,
label: str = None,
random_state: Optional[int] = None,
entities: List[Union[str, List[str]]] = [],
vmv_extensions: Optional[list]=[],
sample: Optional[pd.DataFrame] = None,
@@ -37,23 +38,26 @@ def __init__(self,
df (pd.DataFrame): reference DataFrame used to run the DataQuality analysis.
label (str, optional): [MISSINGS, LABELLING, DRIFT ANALYSIS] target feature to be predicted.
If not specified, LABELLING is skipped.
random_state (int, optional): Integer seed for random reproducibility. Default is None.
Set to None for fully random behaviour, no reproducibility.
entities: [DUPLICATES] entities relevant for duplicate analysis.
vmv_extensions: [VALUED MISSING VALUES] A list of user provided valued missing values to append to defaults.
sample: [DRIFT ANALYSIS] data against which drift is tested.
model: [DRIFT ANALYSIS] model wrapped by ModelWrapper used to test concept drift.
"""
self.df = df
self._warnings = list()
self._random_state = random_state
self._engines = { # Default list of engines
'duplicates': DuplicateChecker(df=df, entities=entities),
'missings': MissingsProfiler(df=df, target=label),
'missings': MissingsProfiler(df=df, target=label, random_state=self.random_state),
'valued-missing-values': VMVIdentifier(df=df, vmv_extensions=vmv_extensions),
'drift-analysis': DriftAnalyser(ref=df, sample=sample, label=label, model=model)
'drift-analysis': DriftAnalyser(ref=df, sample=sample, label=label, model=model, random_state=self.random_state)
}

# Engines based on mandatory arguments
if label is not None:
self._engines['labelling'] = LabelInspector(df=df, label=label)
self._engines['labelling'] = LabelInspector(df=df, label=label, random_state=self.random_state)
else:
print('Label is not defined. Skipping LABELLING engine.')

@@ -78,6 +82,20 @@ def engines(self):
"Dictionary of instantiated engines to run data quality analysis."
return self._engines

@property
def random_state(self):
"Random state passed to individual engines on evaluate."
return self._random_state

@random_state.setter
def random_state(self, new_state):
"Sets new state to random state."
if new_state==None or (isinstance(new_state, int) and new_state>=0):
self._random_state = new_state
else:
print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None (no reproducibility).')
self._random_state = None

def __store_warnings(self):
"Appends all warnings from individiual engines into warnings of DataQuality main class."
for engine in self.engines.values():
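Usage sketch (not part of the diff above) of the new parameter and the setter validation shown in this file; the import path is an assumption based on the file's location.

import pandas as pd
from ydata_quality.core.data_quality import DataQuality  # path assumed

df = pd.DataFrame({"feature": range(8), "target": [0, 1] * 4})

dq = DataQuality(df=df, label="target", random_state=42)  # seed forwarded to the engines
print(dq.random_state)  # 42

dq.random_state = -1    # rejected by the setter: warns and falls back to None
print(dq.random_state)  # None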
19 changes: 18 additions & 1 deletion src/ydata_quality/core/engine.py
@@ -6,6 +6,7 @@
from typing import Optional

import pandas as pd
from numpy import random

from ydata_quality.core.warnings import Priority, QualityWarning
from ydata_quality.utils.modelling import infer_dtypes
@@ -14,12 +15,13 @@
class QualityEngine(ABC):
"Main class for running and storing data quality analysis."

def __init__(self, df: pd.DataFrame, label: str = None, dtypes: dict = None):
def __init__(self, df: pd.DataFrame, random_state: Optional[int] = None, label: str = None, dtypes: dict = None):
self._df = df
self._warnings = list()
self._tests = []
self._label = label
self._dtypes = dtypes
self._random_state = random_state

@property
def df(self):
@@ -62,6 +64,21 @@ def dtypes(self, dtypes: dict):
dtypes[col] = dtype
self._dtypes = dtypes

@property
def random_state(self):
"Last set random state."
return self._random_state

@random_state.setter
def random_state(self, new_state):
"Sets new state to random state."
try:
self._random_state = new_state
random.seed(self.random_state)
except (TypeError, ValueError):
print('An invalid random state was passed. Acceptable values are integers >= 0 or None. Setting to None.')
self._random_state = None

def __clean_warnings(self):
"""Deduplicates and sorts the list of warnings."""
self._warnings = sorted(list(set(self._warnings))) # Sort unique warnings by priority
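For reference, the setter above relies on seeding NumPy's global legacy RNG; a minimal standalone illustration of why that makes downstream draws reproducible:

from numpy import random

random.seed(42)          # what the random_state setter does under the hood
first = random.rand(3)

random.seed(42)          # re-seeding with the same value...
second = random.rand(3)  # ...reproduces the exact same draws

assert (first == second).all()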
14 changes: 7 additions & 7 deletions src/ydata_quality/drift/engine.py
@@ -5,6 +5,7 @@

import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
from scipy.stats import ks_2samp
from scipy.stats._continuous_distns import chi2_gen
@@ -69,7 +70,7 @@ class DriftAnalyser(QualityEngine):

def __init__(self, ref: pd.DataFrame, sample: Optional[pd.DataFrame] = None,
label: Optional[str] = None, model: Callable = None, holdout: float = 0.2,
random_state: Optional[int] = 0):
random_state: Optional[int] = None):
"""
Initializes the engine properties and lists tests for automated evaluation.
Args:
@@ -84,12 +85,11 @@ def __init__(self, ref: pd.DataFrame, sample: Optional[pd.DataFrame] = None,
random_state (Optional, int): Seed used to guarantee reproducibility of the random sample splits.
Pass None for no reproducibility.
"""
super().__init__(df=ref, label=label)
super().__init__(df=ref, label=label, random_state=random_state)
self.sample = sample
self._model = model
self.has_model = None
self._random_state = random_state
self._holdout, self._remaining_data = random_split(ref, holdout, random_state=self._random_state)
self._holdout, self._remaining_data = random_split(ref, holdout, random_state=self.random_state)
self._tests = ['ref_covariate_drift', 'ref_label_drift', 'sample_covariate_drift',
'sample_label_drift', 'sample_concept_drift']

@@ -194,7 +194,7 @@ def ref_covariate_drift(self, p_thresh: float= 0.05) -> pd.DataFrame:
bonferroni_p = p_thresh/len(covariates.columns) # Bonferroni correction
all_p_vals = pd.DataFrame(index=perc_index, columns=covariates.columns)
for idx, fraction in enumerate(leftover_fractions):
downsample, _ = random_split(covariates, fraction, random_state=self._random_state)
downsample, _ = random_split(covariates, fraction, random_state=self.random_state)
p_vals = []
for column in covariates.columns:
_, p_val, _ = self._2sample_feat_good_fit(ref_sample = holdout[column],
@@ -223,10 +223,10 @@ def ref_label_drift(self, p_thresh: float= 0.05):
labels = self._remaining_data[self.label].copy()
holdout = self._holdout[self.label]
leftover_fractions = np.arange(0.2, 1.2, 0.2)
p_values = pd.DataFrame(index=["{0:.0%}".format(fraction) for fraction in leftover_fractions],
p_values = pd.DataFrame(index=["{:.0%}".format(fraction) for fraction in leftover_fractions],
columns=['Label p-value', 'p-value threshold'])
for idx, fraction in enumerate(leftover_fractions):
downsample, _ = random_split(labels, fraction, random_state=self._random_state)
downsample, _ = random_split(labels, fraction, random_state=self.random_state)
_, p_val, test_name = self._2sample_feat_good_fit(ref_sample = holdout,
test_sample = downsample)
p_values['Label p-value'].iloc[idx] = p_val
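Usage sketch (not part of the diff above): with the default now None, the holdout split is fully random unless a seed is passed explicitly. Import path assumed from this repository layout.

import pandas as pd
from ydata_quality.drift.engine import DriftAnalyser  # path assumed

ref = pd.DataFrame({"x": range(100), "y": [i % 2 for i in range(100)]})
sample = ref.sample(frac=0.5, random_state=0)

# An integer seed makes the internal holdout/remaining split reproducible;
# omitting it (None) gives a different split on every run.
da = DriftAnalyser(ref=ref, sample=sample, label="y", random_state=24)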
3 changes: 1 addition & 2 deletions src/ydata_quality/duplicates/engine.py
@@ -13,9 +13,8 @@ class DuplicateChecker(QualityEngine):
"Engine for running analyis on duplicate records."

def __init__(self, df: pd.DataFrame, entities: List[Union[str, List[str]]] = []):
self._df = df
super().__init__(df=df)
self._entities = entities
self._warnings = set()
self._tests = ["exact_duplicates", "entity_duplicates", "duplicate_columns"]

@property
20 changes: 10 additions & 10 deletions src/ydata_quality/labelling/engine.py
@@ -1,7 +1,7 @@
"""
Implementation of LabelInspector engine class to run label quality analysis.
"""
from typing import Union
from typing import Union, Optional

import pandas as pd

Expand All @@ -13,20 +13,20 @@
standard_transform)


def LabelInspector(df, label):
def LabelInspector(df, label, random_state: Optional[int]=None):
"""Instantiate this label inspector class.
Runs a label type inference to instantiate the correct label inspector."""
label_dtype = infer_dtypes(df[label])[label] # Label column dtype inferral
if label_dtype == 'categorical':
return CategoricalLabelInspector(df, label)
return CategoricalLabelInspector(df, label, random_state=random_state)
else:
return NumericalLabelInspector(df, label)
return NumericalLabelInspector(df, label, random_state=random_state)

class SharedLabelInspector(QualityEngine):
"""Shared structure for Numerical/Categorical Label Inspector"""

def __init__(self, df: pd.DataFrame, label: str):
super().__init__(df=df, label=label)
def __init__(self, df: pd.DataFrame, label: str, random_state=None):
super().__init__(df=df, label=label, random_state=random_state)
self._tdf = None

@property
@@ -75,8 +75,8 @@ class CategoricalLabelInspector(SharedLabelInspector):
"""Engine for running analysis on categorical labels.
Ordinal labels can be handled if passed as categorical."""

def __init__(self, df: pd.DataFrame, label: str):
super().__init__(df=df, label=label)
def __init__(self, df: pd.DataFrame, label: str, random_state: Optional[int]):
super().__init__(df=df, label=label, random_state=random_state)
self._centroids = None
self._tests = ["missing_labels", "few_labels", "unbalanced_classes",
"one_vs_rest_performance", "outlier_detection"]
@@ -240,8 +240,8 @@ def outlier_detection(self, th=3):
class NumericalLabelInspector(SharedLabelInspector):
"Engine for running analyis on numerical labels."

def __init__(self, df: pd.DataFrame, label: str):
super().__init__(df, label)
def __init__(self, df: pd.DataFrame, label: str, random_state):
super().__init__(df=df, label=label, random_state=random_state)
self._tests = ["missing_labels", "test_normality", "outlier_detection"]

def _GMM_clusters(self, max_clusters):
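Usage sketch (not part of the diff above): the factory infers the label dtype and forwards the seed to the concrete inspector it returns, so sampling-based tests are repeatable. Import path assumed from this repository layout.

import pandas as pd
from ydata_quality.labelling.engine import LabelInspector  # path assumed

df = pd.DataFrame({"label": ["a", "b", "a", "c", "b", "a"], "x": range(6)})

# Returns a CategoricalLabelInspector here (string label), seeded for repeatability.
li = LabelInspector(df=df, label="label", random_state=7)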
4 changes: 2 additions & 2 deletions src/ydata_quality/missings/engine.py
@@ -15,14 +15,14 @@
class MissingsProfiler(QualityEngine):
"Main class to run missing value analysis."

def __init__(self, df: pd.DataFrame, target: Optional[str] = None):
def __init__(self, df: pd.DataFrame, target: Optional[str] = None, random_state: Optional[int]=None):
"""
Args:
df (pd.DataFrame): reference DataFrame used to run the missing value analysis.
target (str, optional): target feature to be predicted
"""
#TODO: Rename 'target' argument to 'label' standard of QualityEngine
super().__init__(df=df)
super().__init__(df=df, random_state=random_state)
self._target = target
self._tests = ["nulls_higher_than", "high_missing_correlations", "predict_missings"]

7 changes: 3 additions & 4 deletions src/ydata_quality/utils/auxiliary.py
@@ -24,19 +24,18 @@ def test_load_json_path(json_path: str) -> dict:
raise IOError("Expected a path to a json file.")
return json_dict

def random_split(df: Union[pd.DataFrame, pd.Series], split_size: float, shuffle=True,
random_state: int=None) -> Tuple[pd.DataFrame]:
def random_split(df: Union[pd.DataFrame, pd.Series], split_size: float, shuffle: bool=True, random_state: int=None) -> Tuple[pd.DataFrame]:
"""Shuffles a DataFrame and splits it into 2 partitions according to split_size.
Returns a tuple with the split first (the partition corresponding to split_size) and the remainder second.
Args:
df (pd.DataFrame): A DataFrame to be split
split_size (float): Fraction of the sample to be taken
shuffle (bool): If True shuffles sample rows before splitting
random_state (int): If an int is passed, the random process is reproducible using the provided seed"""
assert random_state is None or (isinstance(random_state, int) and random_state>=0), 'The random seed must be a non-negative integer or None.'
assert 0<= split_size <=1, 'split_size must be a fraction, i.e. a float in the [0,1] interval.'
assert random_state is None or isinstance(random_state, int), 'The random seed must be an integer or None.'
if shuffle: # Shuffle dataset rows
sample = df.sample(frac=1, random_state=random_state) # An int random_state ensures reproducibility
sample = df.sample(frac=1, random_state=random_state)
split_len = int(sample.shape[0]*split_size)
split = sample.iloc[:split_len]
remainder = sample.iloc[split_len:]
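A minimal sketch of the reproducibility contract of random_split (import path assumed from this repository layout): the same non-negative integer seed always yields the same shuffle, hence identical partitions.

import pandas as pd
from ydata_quality.utils.auxiliary import random_split  # path assumed

df = pd.DataFrame({"x": range(10)})

split_a, rest_a = random_split(df, split_size=0.3, random_state=42)
split_b, rest_b = random_split(df, split_size=0.3, random_state=42)

# Same seed, same shuffle, same partitions.
assert split_a.equals(split_b) and rest_a.equals(rest_b)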