generated from ydataai/opensource-template
-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(bias-fairness): added engine mvp (#14)
Features: Performance Discrimination; Proxy Identification; Sensitive Predictability; Sensitive Representativity; (utils) PredictionTask enum; (modelling) added adjusted performance.
- Loading branch information
UrbanoFonseca
authored
Sep 8, 2021
1 parent
ffac9f2
commit 56b7340
Showing
9 changed files
with
10,811 additions
and
21 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,3 +2,4 @@ pandas==1.2.* | |
pydantic==1.8.2 | ||
scikit-learn==0.24.2 | ||
matplotlib==3.4.2 | ||
dython==0.6.7 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
""" | ||
Tools to check dataset for bias and fairness. | ||
""" | ||
from ydata_quality.bias_fairness.engine import BiasFairness | ||
|
||
# Public API of the bias_fairness subpackage: only the engine is exported.
__all__ = [
    "BiasFairness"
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
""" | ||
Implementation of BiasFairness engine to run bias and fairness analysis. | ||
""" | ||
|
||
from typing import List, Optional | ||
|
||
import pandas as pd | ||
from dython.nominal import compute_associations | ||
from ydata_quality.core import QualityEngine, QualityWarning | ||
from ydata_quality.utils.correlations import filter_associations | ||
from ydata_quality.utils.modelling import (baseline_performance, | ||
performance_per_feature_values) | ||
|
||
|
||
class BiasFairness(QualityEngine):
    """ Engine to run bias and fairness analysis.

    Tests:
        - Proxy Identification: tests for high correlation between sensitive and non-sensitive features
        - Sensitive Predictability: trains a baseline model to predict sensitive attributes
        - Performance Discrimination: checks for performance disparities on sensitive attributes
    """

    def __init__(self, df: pd.DataFrame, sensitive_features: List[str], label: Optional[str] = None):
        """
        Args:
            df (pd.DataFrame): reference DataFrame used to run the analysis
            sensitive_features (List[str]): features deemed as sensitive attributes
            label (str, optional): target feature to be predicted
        """
        super().__init__(df=df, label=label)
        self._sensitive_features = sensitive_features
        self._tests = ["performance_discrimination", "proxy_identification",
                       "sensitive_predictability", "sensitive_representativity"]

    @property
    def sensitive_features(self):
        """Returns the list of features deemed as sensitive attributes."""
        return self._sensitive_features

    def proxy_identification(self, th=0.5):
        """Tests for non-protected features high correlation with sensitive attributes.

        Non-sensitive features can serve as proxy for protected attributes, exposing the data to a possible
        subsequent bias in the data pipeline. High association values indicate that alternative features can
        be used in place of the original sensitive attributes.

        Args:
            th (float): minimum absolute association value to report a feature pair as a proxy.

        Returns:
            pd.Series: feature-pair -> association value, for pairs above the threshold.
        """
        # TODO: multiple thresholds per association type (num/num, num/cat, cat/cat)

        # Compute association measures for sensitive features
        corrs = compute_associations(self.df, num_num_assoc='pearson', nom_nom_assoc='cramer')
        corrs = filter_associations(corrs, th=th, name='association', subset=self.sensitive_features)

        if len(corrs) > 0:
            self.store_warning(
                QualityWarning(
                    test='Proxy Identification', category='Bias&Fairness', priority=2, data=corrs,
                    description=f"Found {len(corrs)} feature pairs of correlation "\
                                f"to sensitive attributes with values higher than defined threshold ({th})."
                ))
        return corrs

    def sensitive_predictability(self, th=0.5, adjusted_metric=True):
        """Trains a baseline classifier to predict sensitive attributes based on remaining features.

        Good performances indicate that alternative features may be working as proxies for sensitive attributes.

        Args:
            th (float): performance value above which a sensitive attribute is flagged as predictable.
            adjusted_metric (bool): whether to use the chance-adjusted performance metric.

        Returns:
            pd.Series: sensitive feature -> baseline model performance.
        """
        # Features removed from the predictors; only include the label when it is defined.
        # BUG FIX: appending self.label unconditionally made df.drop raise KeyError on a
        # None column whenever the engine was built without a label.
        drop_features = self.sensitive_features + ([self.label] if self.label is not None else [])

        # Explicit dtype avoids pandas' implicit-float deprecation warning on empty Series.
        performances = pd.Series(index=self.sensitive_features, dtype=float)
        for feat in performances.index:
            data = self.df.drop(columns=[x for x in drop_features if x != feat])  # drop all except target
            performances[feat] = baseline_performance(df=data, target=feat, adjusted_metric=adjusted_metric)

        high_perfs = performances[performances > th]
        if len(high_perfs) > 0:
            self.store_warning(
                QualityWarning(
                    test='Sensitive Attribute Predictability', category='Bias&Fairness', priority=3, data=high_perfs,
                    description=f"Found {len(high_perfs)} sensitive attribute(s) with high predictability performance"\
                                f" (greater than {th})."
                )
            )
        return performances

    def performance_discrimination(self):
        """Checks for performance disparities for sensitive attributes.

        Get the performance of a baseline model for each feature value of a sensitive attribute.
        High disparities in the performance metrics indicate that the model may not be fair across sensitive attributes.

        Returns:
            dict: sensitive feature -> pd.Series of performance per feature value
                  (empty dict when no label is defined).
        """
        # TODO: support error rate parity metrics (e.g. false positive rate, positive rate)
        if self.label is None:
            print('Argument "label" must be defined to calculate performance discrimination metric. Skipping test.')
            # BUG FIX: original used `pass` and fell through, running the loop below
            # with target=None. Return early with an empty result instead.
            return {}

        res = {}
        for feat in self.sensitive_features:
            res[feat] = pd.Series(performance_per_feature_values(df=self.df, feature=feat, target=self.label))
        return res

    def sensitive_representativity(self, min_pct: float = 0.01):
        """Checks categorical sensitive attributes minimum representativity of feature values.

        Raises a warning if a feature value of a categorical sensitive attribute is not represented
        above a min_pct percentage.

        Args:
            min_pct (float): minimum fraction of the dataset a feature value must cover.

        Returns:
            dict: categorical sensitive feature -> normalized value distribution.
        """
        # TODO: Representativity for numerical features
        res = {}
        categorical_sensitives = [k for (k, v) in self.dtypes.items()
                                  if (v == 'categorical') & (k in self.sensitive_features)]
        for cat in categorical_sensitives:
            dist = self.df[cat].value_counts(normalize=True)  # normalized presence of feature values
            res[cat] = dist  # store the distribution
            low_dist = dist[dist < min_pct]  # filter for low representativity
            if len(low_dist) > 0:
                self.store_warning(
                    QualityWarning(
                        test='Sensitive Attribute Representativity', category='Bias&Fairness', priority=2, data=low_dist,
                        description=f"Found {len(low_dist)} values of '{cat}' sensitive attribute with low representativity"\
                                    f" in the dataset (below {min_pct*100:.2f}%)."
                    )
                )
        return res
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
""" | ||
Utilities for feature correlations. | ||
""" | ||
|
||
from typing import List, Optional | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
def filter_associations(corrs: pd.DataFrame, th: float,
                        name: str = 'corr', subset: Optional[List[str]] = None) -> pd.Series:
    """Filters an association matrix for combinations above a threshold.

    Args:
        corrs (pd.DataFrame): original association matrix (e.g. pandas' corr, dython's compute_associations),
            shape of (n_feats, n_feats) with association metric (e.g. pearson's correlation, theil's u)
            as values
        th (float): filter for associations with absolute value higher than threshold
        name (str): name of the association metric
        subset (List[str], optional): list of feature names to subset original association values
    Returns
        corrs (pd.Series): map of feature_pair to association metric value, filtered
    """
    # TODO: replace in high_missing_correlations method of missings engine
    matrix = corrs.copy()  # work on a copy to keep the caller's matrix intact
    np.fill_diagonal(matrix.values, np.nan)  # self-association of each feature is irrelevant
    if subset is not None:
        matrix = matrix[subset]  # restrict to the requested feature columns
    # Keep only cells whose absolute association exceeds the threshold, then
    # unpivot to long format and drop the masked (NaN) entries.
    melted = matrix[matrix.abs() > th].melt(ignore_index=False).reset_index().dropna()
    # Build an order-independent pair key so (a, b) and (b, a) collapse together.
    melted['features'] = ['_'.join(sorted((row.index, row.variable))) for row in melted.itertuples()]
    melted = melted.drop_duplicates('features')  # keep a single entry per unordered pair
    melted = melted.sort_values(by='value', ascending=False)  # strongest associations first
    return melted.set_index('features').rename(columns={'value': name})[name]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.