feat(bias-fairness): added engine mvp (#14)
Features:
- Performance Discrimination
- Proxy Identification
- Sensitive Predictability
- Sensitive Representativity
- (utils) PredictionTask enum
- (modelling) added adjusted performance
UrbanoFonseca authored Sep 8, 2021
1 parent ffac9f2 commit 56b7340
Showing 9 changed files with 10,811 additions and 21 deletions.
10,001 changes: 10,001 additions & 0 deletions examples/census/census_10k.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions requirements.txt
@@ -2,3 +2,4 @@ pandas==1.2.*
pydantic==1.8.2
scikit-learn==0.24.2
matplotlib==3.4.2
dython==0.6.7
8 changes: 8 additions & 0 deletions src/ydata_quality/bias_fairness/__init__.py
@@ -0,0 +1,8 @@
"""
Tools to check dataset for bias and fairness.
"""
from ydata_quality.bias_fairness.engine import BiasFairness

__all__ = [
"BiasFairness"
]
124 changes: 124 additions & 0 deletions src/ydata_quality/bias_fairness/engine.py
@@ -0,0 +1,124 @@
"""
Implementation of BiasFairness engine to run bias and fairness analysis.
"""

from typing import List, Optional

import pandas as pd
from dython.nominal import compute_associations
from ydata_quality.core import QualityEngine, QualityWarning
from ydata_quality.utils.correlations import filter_associations
from ydata_quality.utils.modelling import (baseline_performance,
performance_per_feature_values)


class BiasFairness(QualityEngine):
""" Engine to run bias and fairness analysis.
Tests:
- Proxy Identification: tests for high correlation between sensitive and non-sensitive features
- Sensitive Predictability: trains a baseline model to predict sensitive attributes
- Performance Discrimination: checks for performance disparities on sensitive attributes
- Sensitive Representativity: checks categorical sensitive attributes for under-represented feature values
"""

def __init__(self, df: pd.DataFrame, sensitive_features: List[str], label: Optional[str] = None):
"""
Args:
df (pd.DataFrame): reference DataFrame used to run the analysis
sensitive_features (List[str]): features deemed as sensitive attributes
label (str, optional): target feature to be predicted
"""
super().__init__(df=df, label=label)
self._sensitive_features = sensitive_features
self._tests = ["performance_discrimination", "proxy_identification",
"sensitive_predictability", "sensitive_representativity"]

@property
def sensitive_features(self):
"Returns a list of sensitive features."
return self._sensitive_features

def proxy_identification(self, th=0.5):
"""Tests for non-protected features high correlation with sensitive attributes.
Non-sensitive features can serve as proxy for protected attributes, exposing the data to a possible
subsequent bias in the data pipeline. High association values indicate that alternative features can
be used in place of the original sensitive attributes.
"""
# TODO: multiple thresholds per association type (num/num, num/cat, cat/cat)

# Compute association measures for sensitive features
corrs = compute_associations(self.df, num_num_assoc='pearson', nom_nom_assoc='cramer')
corrs = filter_associations(corrs, th=th, name='association', subset=self.sensitive_features)

if len(corrs) > 0:
self.store_warning(
QualityWarning(
test='Proxy Identification', category='Bias&Fairness', priority=2, data=corrs,
description=f"Found {len(corrs)} feature pairs of correlation "\
f"to sensitive attributes with values higher than defined threshold ({th})."
))
return corrs


def sensitive_predictability(self, th=0.5, adjusted_metric=True):
"""Trains a baseline classifier to predict sensitive attributes based on remaining features.
High performance indicates that other features may be acting as proxies for the sensitive attributes.
"""
drop_features = self.sensitive_features + [self.label] # features to remove in prediction

performances = pd.Series(index=self.sensitive_features)
for feat in performances.index:
data = self.df.drop(columns=[x for x in drop_features if x != feat]) # drop all except target
performances[feat] = baseline_performance(df=data, target=feat, adjusted_metric=adjusted_metric)

high_perfs = performances[performances>th]
if len(high_perfs) > 0:
self.store_warning(
QualityWarning(
test='Sensitive Attribute Predictability', category='Bias&Fairness', priority=3, data=high_perfs,
description=f"Found {len(high_perfs)} sensitive attribute(s) with high predictability performance"\
f" (greater than {th})."
)
)
return performances

def performance_discrimination(self):
"""Checks for performance disparities for sensitive attributes.
Get the performance of a baseline model for each feature value of a sensitive attribute.
High disparities in the performance metrics indicate that the model may not be fair across sensitive attributes.
"""
# TODO: support error rate parity metrics (e.g. false positive rate, positive rate)
if self.label is None:
print('Argument "label" must be defined to calculate performance discrimination metric. Skipping test.')
return

res = {}
for feat in self.sensitive_features:
res[feat] = pd.Series(performance_per_feature_values(df=self.df, feature=feat, target=self.label))
return res


def sensitive_representativity(self, min_pct: float = 0.01):
"""Checks categorical sensitive attributes minimum representativity of feature values.
Raises a warning if a feature value of a categorical sensitive attribute is not represented above a min_pct percentage.
"""
# TODO: Representativity for numerical features
res = {}
categorical_sensitives = [k for (k,v) in self.dtypes.items() if (v == 'categorical') & (k in self.sensitive_features)]
for cat in categorical_sensitives:
dist = self.df[cat].value_counts(normalize=True) # normalized presence of feature values
res[cat] = dist # store the distribution
low_dist = dist[dist<min_pct] # filter for low representativity
if len(low_dist) > 0:
self.store_warning(
QualityWarning(
test='Sensitive Attribute Representativity', category='Bias&Fairness', priority=2, data=low_dist,
description=f"Found {len(low_dist)} values of '{cat}' sensitive attribute with low representativity"\
f" in the dataset (below {min_pct*100:.2f}%)."
)
)
return res
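A minimal usage sketch of the new engine (not part of this commit): it assumes the bundled census sample exposes 'race', 'sex' and 'income' columns and that the QualityEngine base class provides an evaluate() method that runs all registered tests.

import pandas as pd
from ydata_quality.bias_fairness import BiasFairness

df = pd.read_csv("examples/census/census_10k.csv")
bf = BiasFairness(df=df, sensitive_features=["race", "sex"], label="income")

full_results = bf.evaluate()                    # run every test registered in self._tests
proxies = bf.proxy_identification(th=0.5)       # feature pairs highly associated with sensitive attributes
predictability = bf.sensitive_predictability()  # adjusted baseline performance per sensitive attribute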
5 changes: 3 additions & 2 deletions src/ydata_quality/missings/engine.py
Expand Up @@ -21,6 +21,7 @@ def __init__(self, df: pd.DataFrame, target: Optional[str] = None):
df (pd.DataFrame): reference DataFrame used to run the missing value analysis.
target (str, optional): target
"""
# TODO: Rename 'target' argument to 'label' to follow the QualityEngine standard
super().__init__(df=df)
self._target = target
self._tests = ["nulls_higher_than", "high_missing_correlations", "predict_missings"]
@@ -135,13 +136,13 @@ def performance_drop(self, col: Union[List[str], str, None] = None, normalize=Tr
# Guesstimate the prediction type
prediction_type = self.__get_prediction_type()
results = pd.DataFrame({
c: performance_per_missing_value(df=self.df, feature=c, target=self.target, type=prediction_type)
c: performance_per_missing_value(df=self.df, feature=c, target=self.target, task=prediction_type)
for c in cols
})

# Normalize the results with a baseline performance.
if normalize:
baseline = baseline_performance(df=self.df, target=self.target, type=prediction_type)
baseline = baseline_performance(df=self.df, target=self.target, task=prediction_type)
results = results / baseline

return results
35 changes: 35 additions & 0 deletions src/ydata_quality/utils/correlations.py
@@ -0,0 +1,35 @@
"""
Utilities for feature correlations.
"""

from typing import List, Optional

import numpy as np
import pandas as pd


def filter_associations(corrs: pd.DataFrame, th: float,
name: str = 'corr', subset: Optional[List[str]] = None) -> pd.Series:
"""Filters an association matrix for combinations above a threshold.
Args:
corrs (pd.DataFrame): original association matrix (e.g. pandas' corr, dython's compute_associations),
shape of (n_feats, n_feats) with association metric (e.g. pearson's correlation, theil's u)
as values
th (float): filter for associations with absolute value higher than threshold
name (str): name of the association metric
subset (List[str], optional): list of feature names to subset original association values
Returns:
corrs (pd.Series): map of feature_pair to association metric value, filtered
"""
# TODO: replace in high_missing_correlations method of missings engine
corrs = corrs.copy() # keep original
np.fill_diagonal(corrs.values, np.nan) # remove the same column pairs
corrs = corrs[subset] if subset is not None else corrs # subset features
corrs = corrs[(corrs>th) | (corrs<-th)].melt(ignore_index=False).reset_index().dropna() # subset by threshold
corrs['features'] = ['_'.join(sorted((i.index, i.variable))) for i in corrs.itertuples()] # create the sorted pairs of feature names
corrs.drop_duplicates('features', inplace=True) # deduplicate combination pairs
corrs.sort_values(by='value', ascending=False, inplace=True) # sort by correlation
corrs = corrs.set_index('features').rename(columns={'value': name})[name] # rename and subset columns
return corrs
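For reference, a small sketch of how filter_associations behaves on a plain pandas correlation matrix (toy data, not part of this commit):

import pandas as pd
from ydata_quality.utils.correlations import filter_associations

toy = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 5], "c": [4, 3, 2, 1]})
corr = toy.corr()  # 3x3 Pearson correlation matrix
strong = filter_associations(corr, th=0.9, name="corr", subset=["a"])
print(strong)  # Series indexed by sorted feature pairs (e.g. 'a_b') holding values with |corr| > 0.9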
5 changes: 5 additions & 0 deletions src/ydata_quality/utils/enum.py
@@ -4,6 +4,11 @@

from enum import Enum

class PredictionTask(Enum):
"Enum of supported prediction tasks."
CLASSIFICATION = 'classification'
REGRESSION = 'regression'

class OrderedEnum(Enum):
"Enum with support for ordering."
def __ge__(self, other):
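A quick illustration (not part of this commit) of how the new enum round-trips from the plain strings used by the modelling utilities:

from ydata_quality.utils.enum import PredictionTask

task = PredictionTask('classification')  # built from the string value
assert task is PredictionTask.CLASSIFICATION
print(task.value)  # 'classification'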
95 changes: 76 additions & 19 deletions src/ydata_quality/utils/modelling.py
@@ -6,9 +6,9 @@

import numpy as np
import pandas as pd
from scipy.stats import boxcox, normaltest
from scipy.stats import boxcox, normaltest, mode
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, roc_auc_score
@@ -20,6 +20,8 @@
label_binarize)
from sklearn.utils._testing import ignore_warnings

from ydata_quality.utils.enum import PredictionTask

BASELINE_CLASSIFIER = Pipeline([
('imputer', SimpleImputer()),
('classifier', LogisticRegression())
@@ -40,66 +42,121 @@

ORDINAL_TRANSFORMER = None # Not implemented

def get_prediction_task(df: pd.DataFrame, label: str):
"Heuristics to infer prediction task (classification/regression)."
# TODO: Improve prediction type guesstimate based on alternative heuristics (e.g. dtypes, value_counts)
if len(set(df[label])) == 2: # binary classification
return 'classification'
else:
return 'regression'

@ignore_warnings(category=ConvergenceWarning)
def baseline_predictions(df: pd.DataFrame, target: str, type='classification'):
def baseline_predictions(df: pd.DataFrame, target: str, task='classification'):
"Train a baseline model and predict for a test set"

# 0. Infer the prediction task
task = get_prediction_task(df=df, label=target)

# 1. Define the baseline model
model = BASELINE_CLASSIFIER if type == 'classification' else BASELINE_REGRESSION
model = BASELINE_CLASSIFIER if task == 'classification' else BASELINE_REGRESSION

# 2. Train overall model
X, y = df.drop(target, axis=1), df[target]
X, y = df.drop(target, axis=1), label_binarize(df[target], classes=list(set(df[target])))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model.fit(X_train.select_dtypes('number'), y_train)

# 3. Predict
if type == 'regression':
if task == 'regression':
y_pred = model.predict(X_test.select_dtypes('number'))
elif type == 'classification':
elif task == 'classification':
y_pred = model.predict_proba(X_test.select_dtypes('number'))[:, 1]

# 4. Return both the predictions and X_test, y_test to analyze the performances
return y_pred, X_test, y_test

def baseline_performance(df: pd.DataFrame, target: str, type='classification'):
"Train a baseline model, predict for a test set and return the performance."
@ignore_warnings(category=DataConversionWarning)
def baseline_performance(df: pd.DataFrame, target: str,
task: PredictionTask = PredictionTask.CLASSIFICATION,
adjusted_metric: bool = False):
"""Train a baseline model, predict for a test set and return the performance.
Args:
- df (pd.DataFrame): original dataset
- target (str): name of target feature column
- task (PredictionTask): classification, regression
- adjusted_metric (bool): if True, return metric as percentage of max achievable performance
"""

# 0. Infer the prediction task
task = get_prediction_task(df=df, label=target)

# 1. Define the baseline performance metric
metric = roc_auc_score if type == 'classification' else mean_squared_error
metric = roc_auc_score if task == 'classification' else mean_squared_error

# 2. Get the baseline predictions
y_pred, _, y_test = baseline_predictions(df=df, target=target, type=type)
y_pred, _, y_test = baseline_predictions(df=df, target=target, task=task)

# 3. Get the performance
return metric(y_test, y_pred)
if adjusted_metric:
perf = adjusted_performance(y_test, y_pred, task=task, metric=metric)
else:
perf = metric(y_test, y_pred)
return perf

def adjusted_performance(y_true, y_pred, task: PredictionTask, metric: callable):
"""Calculates the adjusted metric as ratio of real to maximum performance.
Returns the attained fraction of the maximum achievable improvement over a trivial baseline.
"""
task = PredictionTask(task)
y_default = np.mean(y_true) if task == PredictionTask.CLASSIFICATION else mode(y_true).mode[0] # define the value
y_base = np.tile(y_default, (len(y_true), 1)) # create an array with default value

def performance_per_feature_values(df: pd.DataFrame, feature: str, target: str, type='classification'):
best_perf = metric(y_true, y_true)
base_perf = metric(y_true, y_base)
real_perf = metric(y_true, y_pred)

return (real_perf - base_perf) / (best_perf - base_perf)



@ignore_warnings(category=DataConversionWarning)
def performance_per_feature_values(df: pd.DataFrame, feature: str, target: str, task='classification'):
"""Performance achieved per each value of a groupby feature."""

# 0. Infer the prediction task
task = get_prediction_task(df=df, label=target)

# 1. Define the baseline performance metric
metric = roc_auc_score if type == 'classification' else mean_squared_error
metric = roc_auc_score if task == 'classification' else mean_squared_error

# 2. Get the baseline predictions
y_pred, X_test, y_test = baseline_predictions(df=df, target=target, type=type)
y_pred, X_test, y_test = baseline_predictions(df=df, target=target, task=task)

# 3. Get the performances per feature value
uniques = set(X_test[feature])
results = {}
for i in uniques: # for each category
y_pred_cat = y_pred[X_test[feature]==i]
y_true_cat = y_test[X_test[feature]==i]
results[i] = metric(y_true_cat, y_pred_cat)
try:
results[i] = metric(y_true_cat, y_pred_cat)
except Exception as exc:
results[i] = f'[ERROR] Failed performance metric with message: {exc}'

return results

def performance_per_missing_value(df: pd.DataFrame, feature: str, target: str, type='classification'):
def performance_per_missing_value(df: pd.DataFrame, feature: str, target: str, task='classification'):
"""Performance difference between valued and missing values in feature."""

# 0. Infer the prediction task
task = get_prediction_task(df=df, label=target)

# 1. Define the baseline performance metric
metric = roc_auc_score if type == 'classification' else mean_squared_error
metric = roc_auc_score if task == 'classification' else mean_squared_error

# 2. Get the baseline predictions
y_pred, X_test, y_test = baseline_predictions(df=df, target=target, type=type)
y_pred, X_test, y_test = baseline_predictions(df=df, target=target, task=task)

# 3. Get the performance per valued vs missing feature
missing_mask = X_test[feature].isna()
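To make the adjusted metric concrete, a small hand-checked sketch (illustrative values, not part of this commit): with ROC AUC a perfect model scores best_perf = 1.0 and a constant baseline scores base_perf = 0.5, so a model at real_perf = 0.75 attains (0.75 - 0.5) / (1.0 - 0.5) = 0.5 of the achievable improvement.

import numpy as np
from sklearn.metrics import roc_auc_score
from ydata_quality.utils.enum import PredictionTask
from ydata_quality.utils.modelling import adjusted_performance

y_true = np.array([0, 0, 1, 1])
y_pred = np.array([0.1, 0.6, 0.4, 0.9])  # one misordered pair -> AUC = 0.75
ratio = adjusted_performance(y_true, y_pred, task=PredictionTask.CLASSIFICATION, metric=roc_auc_score)
print(ratio)  # (0.75 - 0.5) / (1.0 - 0.5) = 0.5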