feat(bias-fairness): added engine mvp (#14)
Features:
- Performance Discrimination
- Proxy Identification
- Sensitive Predictability
- Sensitive Representativity
- (utils) PredictionTask enum
- (modelling) added adjusted performance
UrbanoFonseca authored Sep 8, 2021
1 parent ffac9f2 commit 56b7340
Showing 9 changed files with 10,811 additions and 21 deletions.
10,001 changes: 10,001 additions & 0 deletions examples/census/census_10k.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions requirements.txt
@@ -2,3 +2,4 @@ pandas==1.2.*
pydantic==1.8.2
scikit-learn==0.24.2
matplotlib==3.4.2
dython==0.6.7
8 changes: 8 additions & 0 deletions src/ydata_quality/bias_fairness/__init__.py
@@ -0,0 +1,8 @@
"""
Tools to check dataset for bias and fairness.
"""
from ydata_quality.bias_fairness.engine import BiasFairness

__all__ = [
"BiasFairness"
]
124 changes: 124 additions & 0 deletions src/ydata_quality/bias_fairness/engine.py
@@ -0,0 +1,124 @@
"""
Implementation of BiasFairness engine to run bias and fairness analysis.
"""

from typing import List, Optional

import pandas as pd
from dython.nominal import compute_associations
from ydata_quality.core import QualityEngine, QualityWarning
from ydata_quality.utils.correlations import filter_associations
from ydata_quality.utils.modelling import (baseline_performance,
performance_per_feature_values)


class BiasFairness(QualityEngine):
""" Engine to run bias and fairness analysis.
Tests:
- Proxy Identification: tests for high correlation between sensitive and non-sensitive features
- Sensitive Predictability: trains a baseline model to predict sensitive attributes
- Performance Discrimination: checks for performance disparities on sensitive attributes
- Sensitive Representativity: checks categorical sensitive attributes for under-represented feature values
"""

def __init__(self, df: pd.DataFrame, sensitive_features: List[str], label: Optional[str] = None):
"""
Args:
df (pd.DataFrame): reference DataFrame used to run the analysis
sensitive_features (List[str]): features deemed as sensitive attributes
label (str, optional): target feature to be predicted
"""
super().__init__(df=df, label=label)
self._sensitive_features = sensitive_features
self._tests = ["performance_discrimination", "proxy_identification",
"sensitive_predictability", "sensitive_representativity"]

@property
def sensitive_features(self):
"Returns a list of sensitive features."
return self._sensitive_features

def proxy_identification(self, th=0.5):
"""Tests for non-protected features high correlation with sensitive attributes.
Non-sensitive features can serve as proxy for protected attributes, exposing the data to a possible
subsequent bias in the data pipeline. High association values indicate that alternative features can
be used in place of the original sensitive attributes.
"""
# TODO: multiple thresholds per association type (num/num, num/cat, cat/cat)

# Compute association measures for sensitive features
corrs = compute_associations(self.df, num_num_assoc='pearson', nom_nom_assoc='cramer')
corrs = filter_associations(corrs, th=th, name='association', subset=self.sensitive_features)

if len(corrs) > 0:
self.store_warning(
QualityWarning(
test='Proxy Identification', category='Bias&Fairness', priority=2, data=corrs,
description=f"Found {len(corrs)} feature pairs of correlation "\
f"to sensitive attributes with values higher than defined threshold ({th})."
))
return corrs


def sensitive_predictability(self, th=0.5, adjusted_metric=True):
"""Trains a baseline classifier to predict sensitive attributes based on remaining features.
High performance indicates that other features may be acting as proxies for the sensitive attributes.
"""
drop_features = self.sensitive_features + [self.label] # features to remove in prediction

performances = pd.Series(index=self.sensitive_features)
for feat in performances.index:
data = self.df.drop(columns=[x for x in drop_features if x != feat]) # drop all except target
performances[feat] = baseline_performance(df=data, target=feat, adjusted_metric=adjusted_metric)

high_perfs = performances[performances>th]
if len(high_perfs) > 0:
self.store_warning(
QualityWarning(
test='Sensitive Attribute Predictability', category='Bias&Fairness', priority=3, data=high_perfs,
description=f"Found {len(high_perfs)} sensitive attribute(s) with high predictability performance"\
f" (greater than {th})."
)
)
return performances

def performance_discrimination(self):
"""Checks for performance disparities for sensitive attributes.
Get the performance of a baseline model for each feature value of a sensitive attribute.
High disparities in the performance metrics indicate that the model may not be fair across sensitive attributes.
"""
# TODO: support error rate parity metrics (e.g. false positive rate, positive rate)
if self.label is None:
print('Argument "label" must be defined to calculate performance discrimination metric. Skipping test.')
return

res = {}
for feat in self.sensitive_features:
res[feat] = pd.Series(performance_per_feature_values(df=self.df, feature=feat, target=self.label))
return res


def sensitive_representativity(self, min_pct: float = 0.01):
"""Checks categorical sensitive attributes minimum representativity of feature values.
Raises a warning if a feature value of a categorical sensitive attribute is not represented above a min_pct percentage.
"""
# TODO: Representativity for numerical features
res = {}
categorical_sensitives = [k for (k,v) in self.dtypes.items() if (v == 'categorical') & (k in self.sensitive_features)]
for cat in categorical_sensitives:
dist = self.df[cat].value_counts(normalize=True) # normalized presence of feature values
res[cat] = dist # store the distribution
low_dist = dist[dist<min_pct] # filter for low representativity
if len(low_dist) > 0:
self.store_warning(
QualityWarning(
test='Sensitive Attribute Representativity', category='Bias&Fairness', priority=2, data=low_dist,
description=f"Found {len(low_dist)} values of '{cat}' sensitive attribute with low representativity"\
f" in the dataset (below {min_pct*100:.2f}%)."
)
)
return res
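A minimal usage sketch of the new engine (not part of this commit): it assumes the bundled census sample exposes 'race', 'sex' and 'income' columns and that the QualityEngine base class provides an evaluate() method that runs all registered tests.

import pandas as pd
from ydata_quality.bias_fairness import BiasFairness

df = pd.read_csv("examples/census/census_10k.csv")
bf = BiasFairness(df=df, sensitive_features=["race", "sex"], label="income")

full_results = bf.evaluate()                    # run every test registered in self._tests
proxies = bf.proxy_identification(th=0.5)       # feature pairs highly associated with sensitive attributes
predictability = bf.sensitive_predictability()  # adjusted baseline performance per sensitive attribute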
5 changes: 3 additions & 2 deletions src/ydata_quality/missings/engine.py
Expand Up @@ -21,6 +21,7 @@ def __init__(self, df: pd.DataFrame, target: Optional[str] = None):
df (pd.DataFrame): reference DataFrame used to run the missing value analysis.
target (str, optional): target
"""
# TODO: Rename 'target' argument to 'label' to follow the QualityEngine standard
super().__init__(df=df)
self._target = target
self._tests = ["nulls_higher_than", "high_missing_correlations", "predict_missings"]
@@ -135,13 +136,13 @@ def performance_drop(self, col: Union[List[str], str, None] = None, normalize=Tr
# Guesstimate the prediction type
prediction_type = self.__get_prediction_type()
results = pd.DataFrame({
c: performance_per_missing_value(df=self.df, feature=c, target=self.target, type=prediction_type)
c: performance_per_missing_value(df=self.df, feature=c, target=self.target, task=prediction_type)
for c in cols
})

# Normalize the results with a baseline performance.
if normalize:
baseline = baseline_performance(df=self.df, target=self.target, type=prediction_type)
baseline = baseline_performance(df=self.df, target=self.target, task=prediction_type)
results = results / baseline

return results
35 changes: 35 additions & 0 deletions src/ydata_quality/utils/correlations.py
@@ -0,0 +1,35 @@
"""
Utilities for feature correlations.
"""

from typing import List, Optional

import numpy as np
import pandas as pd


def filter_associations(corrs: pd.DataFrame, th: float,
name: str = 'corr', subset: Optional[List[str]] = None) -> pd.Series:
"""Filters an association matrix for combinations above a threshold.
Args:
corrs (pd.DataFrame): original association matrix (e.g. pandas' corr, dython's compute_associations),
shape of (n_feats, n_feats) with association metric (e.g. pearson's correlation, theil's u)
as values
th (float): filter for associations with absolute value higher than threshold
name (str): name of the association metric
subset (List[str], optional): list of feature names to subset original association values
Returns:
corrs (pd.Series): map of feature_pair to association metric value, filtered
"""
# TODO: replace in high_missing_correlations method of missings engine
corrs = corrs.copy() # keep original
np.fill_diagonal(corrs.values, np.nan) # remove the same column pairs
corrs = corrs[subset] if subset is not None else corrs # subset features
corrs = corrs[(corrs>th) | (corrs<-th)].melt(ignore_index=False).reset_index().dropna() # subset by threshold
corrs['features'] = ['_'.join(sorted((i.index, i.variable))) for i in corrs.itertuples()] # create the sorted pairs of feature names
corrs.drop_duplicates('features', inplace=True) # deduplicate combination pairs
corrs.sort_values(by='value', ascending=False, inplace=True) # sort by correlation
corrs = corrs.set_index('features').rename(columns={'value': name})[name] # rename and subset columns
return corrs
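For reference, a small sketch of how filter_associations behaves on a plain pandas correlation matrix (toy data, not part of this commit):

import pandas as pd
from ydata_quality.utils.correlations import filter_associations

toy = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 5], "c": [4, 3, 2, 1]})
corr = toy.corr()  # 3x3 Pearson correlation matrix
strong = filter_associations(corr, th=0.9, name="corr", subset=["a"])
print(strong)  # Series indexed by sorted feature pairs (e.g. 'a_b') holding values with |corr| > 0.9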
5 changes: 5 additions & 0 deletions src/ydata_quality/utils/enum.py
@@ -4,6 +4,11 @@

from enum import Enum

class PredictionTask(Enum):
"Enum of supported prediction tasks."
CLASSIFICATION = 'classification'
REGRESSION = 'regression'

class OrderedEnum(Enum):
"Enum with support for ordering."
def __ge__(self, other):
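A quick illustration (not part of this commit) of how the new enum round-trips from the plain strings used by the modelling utilities:

from ydata_quality.utils.enum import PredictionTask

task = PredictionTask('classification')  # built from the string value
assert task is PredictionTask.CLASSIFICATION
print(task.value)  # 'classification'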
95 changes: 76 additions & 19 deletions src/ydata_quality/utils/modelling.py
@@ -6,9 +6,9 @@

import numpy as np
import pandas as pd
from scipy.stats import boxcox, normaltest
from scipy.stats import boxcox, normaltest, mode
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, roc_auc_score
@@ -20,6 +20,8 @@
label_binarize)
from sklearn.utils._testing import ignore_warnings

from ydata_quality.utils.enum import PredictionTask

BASELINE_CLASSIFIER = Pipeline([
('imputer', SimpleImputer()),
('classifier', LogisticRegression())
@@ -40,66 +42,121 @@

ORDINAL_TRANSFORMER = None # Not implemented

def get_prediction_task(df: pd.DataFrame, label: str):
"Heuristics to infer prediction task (classification/regression)."
# TODO: Improve prediction type guesstimate based on alternative heuristics (e.g. dtypes, value_counts)
if len(set(df[label])) == 2: # binary classification
return 'classification'
else:
return 'regression'

@ignore_warnings(category=ConvergenceWarning)
def baseline_predictions(df: pd.DataFrame, target: str, type='classification'):
def baseline_predictions(df: pd.DataFrame, target: str, task='classification'):
"Train a baseline model and predict for a test set"

# 0. Infer the prediction task
task = get_prediction_task(df=df, label=target)

# 1. Define the baseline model
model = BASELINE_CLASSIFIER if type == 'classification' else BASELINE_REGRESSION
model = BASELINE_CLASSIFIER if task == 'classification' else BASELINE_REGRESSION

# 2. Train overall model
X, y = df.drop(target, axis=1), df[target]
X, y = df.drop(target, axis=1), label_binarize(df[target], classes=list(set(df[target])))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model.fit(X_train.select_dtypes('number'), y_train)

# 3. Predict
if type == 'regression':
if task == 'regression':
y_pred = model.predict(X_test.select_dtypes('number'))
elif type == 'classification':
elif task == 'classification':
y_pred = model.predict_proba(X_test.select_dtypes('number'))[:, 1]

# 4. Return both the predictions and X_test, y_test to analyze the performances
return y_pred, X_test, y_test

def baseline_performance(df: pd.DataFrame, target: str, type='classification'):
"Train a baseline model, predict for a test set and return the performance."
@ignore_warnings(category=DataConversionWarning)
def baseline_performance(df: pd.DataFrame, target: str,
task: PredictionTask = PredictionTask.CLASSIFICATION,
adjusted_metric: bool = False):
"""Train a baseline model, predict for a test set and return the performance.
Args:
- df (pd.DataFrame): original dataset
- target (str): name of target feature column
- task (PredictionTask): classification, regression
- adjusted_metric (bool): if True, return metric as percentage of max achievable performance
"""

# 0. Infer the prediction task
task = get_prediction_task(df=df, label=target)

# 1. Define the baseline performance metric
metric = roc_auc_score if type == 'classification' else mean_squared_error
metric = roc_auc_score if task == 'classification' else mean_squared_error

# 2. Get the baseline predictions
y_pred, _, y_test = baseline_predictions(df=df, target=target, type=type)
y_pred, _, y_test = baseline_predictions(df=df, target=target, task=task)

# 3. Get the performance
return metric(y_test, y_pred)
if adjusted_metric:
perf = adjusted_performance(y_test, y_pred, task=task, metric=metric)
else:
perf = metric(y_test, y_pred)
return perf

def adjusted_performance(y_true, y_pred, task: PredictionTask, metric: callable):
"""Calculates the adjusted metric as ratio of real to maximum performance.
Returns the attained fraction of the maximum achievable improvement over a trivial baseline.
"""
task = PredictionTask(task)
y_default = np.mean(y_true) if task == PredictionTask.CLASSIFICATION else mode(y_true).mode[0] # define the value
y_base = np.tile(y_default, (len(y_true), 1)) # create an array with default value

def performance_per_feature_values(df: pd.DataFrame, feature: str, target: str, type='classification'):
best_perf = metric(y_true, y_true)
base_perf = metric(y_true, y_base)
real_perf = metric(y_true, y_pred)

return (real_perf - base_perf) / (best_perf - base_perf)



@ignore_warnings(category=DataConversionWarning)
def performance_per_feature_values(df: pd.DataFrame, feature: str, target: str, task='classification'):
"""Performance achieved per each value of a groupby feature."""

# 0. Infer the prediction task
task = get_prediction_task(df=df, label=target)

# 1. Define the baseline performance metric
metric = roc_auc_score if type == 'classification' else mean_squared_error
metric = roc_auc_score if task == 'classification' else mean_squared_error

# 2. Get the baseline predictions
y_pred, X_test, y_test = baseline_predictions(df=df, target=target, type=type)
y_pred, X_test, y_test = baseline_predictions(df=df, target=target, task=task)

# 3. Get the performances per feature value
uniques = set(X_test[feature])
results = {}
for i in uniques: # for each category
y_pred_cat = y_pred[X_test[feature]==i]
y_true_cat = y_test[X_test[feature]==i]
results[i] = metric(y_true_cat, y_pred_cat)
try:
results[i] = metric(y_true_cat, y_pred_cat)
except Exception as exc:
results[i] = f'[ERROR] Failed performance metric with message: {exc}'

return results

def performance_per_missing_value(df: pd.DataFrame, feature: str, target: str, type='classification'):
def performance_per_missing_value(df: pd.DataFrame, feature: str, target: str, task='classification'):
"""Performance difference between valued and missing values in feature."""

# 0. Infer the prediction task
task = get_prediction_task(df=df, label=target)

# 1. Define the baseline performance metric
metric = roc_auc_score if type == 'classification' else mean_squared_error
metric = roc_auc_score if task == 'classification' else mean_squared_error

# 2. Get the baseline predictions
y_pred, X_test, y_test = baseline_predictions(df=df, target=target, type=type)
y_pred, X_test, y_test = baseline_predictions(df=df, target=target, task=task)

# 3. Get the performance per valued vs missing feature
missing_mask = X_test[feature].isna()
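To make the adjusted metric concrete, a small hand-checked sketch (illustrative values, not part of this commit): with ROC AUC a perfect model scores best_perf = 1.0 and a constant baseline scores base_perf = 0.5, so a model at real_perf = 0.75 attains (0.75 - 0.5) / (1.0 - 0.5) = 0.5 of the achievable improvement.

import numpy as np
from sklearn.metrics import roc_auc_score
from ydata_quality.utils.enum import PredictionTask
from ydata_quality.utils.modelling import adjusted_performance

y_true = np.array([0, 0, 1, 1])
y_pred = np.array([0.1, 0.6, 0.4, 0.9])  # one misordered pair -> AUC = 0.75
ratio = adjusted_performance(y_true, y_pred, task=PredictionTask.CLASSIFICATION, metric=roc_auc_score)
print(ratio)  # (0.75 - 0.5) / (1.0 - 0.5) = 0.5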