Skip to content

Issue 265 privileged class bank dataset #449

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ jobs:
wget ${UCI_DB}/statlog/german/german.data -P aif360/data/raw/german/
wget ${UCI_DB}/statlog/german/german.doc -P aif360/data/raw/german/
wget ${PROPUBLICA_GH}/compas-scores-two-years.csv -P aif360/data/raw/compas/
wget ${UCI_DB}/00222/bank-additional.zip -P aif360/data/raw/bank/ && unzip -j aif360/data/raw/bank/bank-additional.zip -d aif360/data/raw/bank/ && rm aif360/data/raw/bank/bank-additional.zip
(cd aif360/data/raw/meps;Rscript generate_data.R <<< y)

- name: Lint with flake8
Expand Down
2 changes: 1 addition & 1 deletion aif360/data/raw/bank/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Additional information on dataset and features is available in `bank-additional-

1. Download the file [bank-additional.zip](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip).

2. Extract files from the downloaded archive and place the files 'bank-additional.csv' and 'bank-additional-names.txt' into the current folder.
2. Extract files from the downloaded archive and place the files 'bank-additional-full.csv' and 'bank-additional-names.txt' into the current folder.

## Relevant Papers

Expand Down
13 changes: 10 additions & 3 deletions aif360/datasets/bank_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class BankDataset(StandardDataset):

def __init__(self, label_name='y', favorable_classes=['yes'],
protected_attribute_names=['age'],
privileged_classes=[lambda x: x >= 25],
privileged_classes=[lambda x: x >= 25 and x < 60],
instance_weights_name=None,
categorical_features=['job', 'marital', 'education', 'default',
'housing', 'loan', 'contact', 'month', 'day_of_week',
Expand All @@ -24,8 +24,15 @@ def __init__(self, label_name='y', favorable_classes=['yes'],
"""See :obj:`StandardDataset` for a description of the arguments.

By default, this code converts the 'age' attribute to a binary value
where privileged is `age >= 25` and unprivileged is `age < 25` as in
:obj:`GermanDataset`.
where privileged is `25 <= age < 60` and unprivileged is `age < 25` or `age >= 60`,
as suggested in Le Quy, Tai, et al. [1]_.

References:
.. [1] Le Quy, Tai, et al. "A survey on datasets for fairness‐aware machine
learning." Wiley Interdisciplinary Reviews: Data Mining and Knowledge
Discovery 12.3 (2022): e1452.


"""

filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
Expand Down
27 changes: 21 additions & 6 deletions aif360/sklearn/datasets/openml_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,12 +168,19 @@ def fetch_german(*, data_home=None, cache=True, binary_age=True, usecols=None,
dropcols=dropcols, numeric_only=numeric_only,
dropna=dropna)

def fetch_bank(*, data_home=None, cache=True, percent10=False, usecols=None,
dropcols=['duration'], numeric_only=False, dropna=False):
def fetch_bank(*, data_home=None, cache=True, binary_age=True, percent10=False,
usecols=None, dropcols=['duration'], numeric_only=False, dropna=False):
"""Load the Bank Marketing Dataset.

The protected attribute is 'age' (left as continuous). The outcome variable
is 'deposit': 'yes' or 'no'.
The protected attribute is 'age' (binarized by default as suggested by [#lequy22]_:
age >= 25 and age < 60 is considered privileged and age < 25 or age >= 60 unprivileged;
see the `binary_age` flag to keep this continuous). The outcome variable is 'deposit':
'yes' or 'no'.

References:
.. [#lequy22] Le Quy, Tai, et al. "A survey on datasets for fairness‐aware machine
learning." Wiley Interdisciplinary Reviews: Data Mining and Knowledge
Discovery 12.3 (2022): e1452.

Note:
By default, the data is downloaded from OpenML. See the `bank-marketing
Expand Down Expand Up @@ -228,7 +235,15 @@ def fetch_bank(*, data_home=None, cache=True, percent10=False, usecols=None,
df[col] = df[col].cat.remove_categories('unknown')
df.education = df.education.astype('category').cat.reorder_categories(
['primary', 'secondary', 'tertiary'], ordered=True)

return standardize_dataset(df, prot_attr='age', target='deposit',

# binarize protected attribute (but not corresponding feature)
age = (pd.cut(df.age, [0, 24, 60, 100], ordered=False,
labels=[0, 1, 0] if numeric_only
else ['<25 or >=60', '25-60', '<25 or >=60'])
if binary_age else 'age')
age = age.cat.reorder_categories([0, 1] if numeric_only
else ['<25 or >=60', '25-60'])

return standardize_dataset(df, prot_attr=[age], target='deposit',
usecols=usecols, dropcols=dropcols,
numeric_only=numeric_only, dropna=dropna)
5 changes: 5 additions & 0 deletions tests/notebook_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@

def notebook_run(path):
"""Execute a notebook via nbconvert and collect output.
Reset cwd after execution.
:returns (parsed nb object, execution errors)
"""
old_cwd = os.getcwd()

dirname, __ = os.path.split(path)
os.chdir(dirname)

Expand All @@ -31,5 +34,7 @@ def notebook_run(path):
errors = [output for cell in nb.cells if "outputs" in cell
for output in cell["outputs"]
if output.output_type == "error"]

os.chdir(old_cwd)

return nb, errors
66 changes: 48 additions & 18 deletions tests/test_standard_datasets.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,74 @@
""" Tests for standard dataset classes """

from unittest.mock import patch
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 200)
import os

from aif360.datasets import AdultDataset
from aif360.datasets import BankDataset
from aif360.datasets import CompasDataset
from aif360.datasets import GermanDataset
from aif360.metrics import BinaryLabelDatasetMetric

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 200)

def test_compas():
''' Test default loading for compas '''
# just test that there are no errors for default loading...
cd = CompasDataset()
# print(cd)
compas_dataset = CompasDataset()
compas_dataset.validate_dataset()

def test_german():
gd = GermanDataset()
bldm = BinaryLabelDatasetMetric(gd)
''' Test default loading for german '''
german_dataset = GermanDataset()
bldm = BinaryLabelDatasetMetric(german_dataset)
assert bldm.num_instances() == 1000

def test_adult_test_set():
ad = AdultDataset()
# test, train = ad.split([16281])
test, train = ad.split([15060])
''' Test default loading for adult, test set '''
adult_dataset = AdultDataset()
test, _ = adult_dataset.split([15060])
assert np.any(test.labels)

def test_adult():
ad = AdultDataset()
# print(ad.feature_names)
assert np.isclose(ad.labels.mean(), 0.2478, atol=5e-5)

bldm = BinaryLabelDatasetMetric(ad)
''' Test default loading for adult, mean'''
adult_dataset = AdultDataset()
assert np.isclose(adult_dataset.labels.mean(), 0.2478, atol=5e-5)
bldm = BinaryLabelDatasetMetric(adult_dataset)
assert bldm.num_instances() == 45222

def test_adult_no_drop():
ad = AdultDataset(protected_attribute_names=['sex'],
''' Test default loading for adult, number of instances '''
adult_dataset = AdultDataset(protected_attribute_names=['sex'],
privileged_classes=[['Male']], categorical_features=[],
features_to_keep=['age', 'education-num'])
bldm = BinaryLabelDatasetMetric(ad)
bldm = BinaryLabelDatasetMetric(adult_dataset)
assert bldm.num_instances() == 48842

def test_bank():
    ''' Test that the bank dataset loads with default arguments and validates '''
    # Build with defaults and validate in one step; any exception fails the test.
    BankDataset().validate_dataset()

def test_bank_priviliged_attributes():
    ''' Test that the default 'age' processing matches the 25 <= age < 60 rule '''
    # Sum of the protected-attribute values of the default dataset
    # (presumably privileged rows are encoded as 1 -- confirm against
    # StandardDataset).
    privileged_count = BankDataset().protected_attributes.sum()

    # Recompute the expected count directly from the raw CSV shipped under
    # aif360/data/raw/bank, treating "unknown" as missing and dropping
    # incomplete rows.
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    raw_csv = os.path.join(tests_dir, '..', 'aif360', 'data', 'raw', 'bank',
                           'bank-additional-full.csv')
    raw_df = pd.read_csv(raw_csv, sep=";", na_values=["unknown"]).dropna()

    in_privileged_range = (raw_df["age"] >= 25) & (raw_df["age"] < 60)
    expected_count = len(raw_df[in_privileged_range])

    assert privileged_count == expected_count