From 58caff32d6ba6bb0b4df3fe58306984595cb2ffd Mon Sep 17 00:00:00 2001 From: Eric Smith Date: Fri, 21 Oct 2022 15:47:08 -0400 Subject: [PATCH 1/4] Add name processing code --- projects/dialogue_bias/README.md | 1 + projects/dialogue_bias/util.py | 212 +++++++++++++++++++++++++++++++ 2 files changed, 213 insertions(+) create mode 100644 projects/dialogue_bias/util.py diff --git a/projects/dialogue_bias/README.md b/projects/dialogue_bias/README.md index fb85f6b2e81..b349aef79df 100644 --- a/projects/dialogue_bias/README.md +++ b/projects/dialogue_bias/README.md @@ -13,6 +13,7 @@ All AI models are susceptible to learning biases in data that they are trained o ## Code - `projects.dialogue_bias.agents:NoBiasStyleGenAgent`: Agent that appends a `"no_bias"` string to the context of every example in order to perform controllable generation. +- In `util.py`, `get_gender_name_list()` and `get_race_ethnicity_gender_name_list()` will process and output the two sets of names used in our work. They require as inputs the raw datasets of names from the papers that we drew names from: see docstrings for details. ## Models diff --git a/projects/dialogue_bias/util.py b/projects/dialogue_bias/util.py new file mode 100644 index 00000000000..7d6e33e1621 --- /dev/null +++ b/projects/dialogue_bias/util.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +from collections import defaultdict +from typing import Dict, List, Tuple + +import pandas as pd + + +RACES_ETHNICITIES = ['hispanic', 'white', 'black', 'api', 'aian', '2prace'] +RACES_ETHNICITIES_WITH_NAMES = ['hispanic', 'white', 'black', 'api'] +# Some races/ethnicities don't have any names on the Tzioumis list for which they are +# the plurality race/ethnicity; we exclude those races/ethnicities from this list. 
+ + +def get_gender_name_list(gender: str, names_path: str) -> List[str]: + """ + Return a list of names of the specified gender from Newman, et al. + + Read names from https://journals.sagepub.com/doi/abs/10.1177/0146167218769858 and + filter by the specified gender. + """ + name_df = pd.read_csv(names_path) + names = name_df[lambda df: df['Gender'].str.lower() == gender][ + 'Name' + ].values.tolist() + print(f'Using {len(names):d} {gender} names: ' + ', '.join(names)) + return names + + +def get_race_ethnicity_gender_name_list( + baby_name_folder: str, + tzioumis_data_path: str, + race_gender_name_lists: Dict[str, List[str]], +) -> Tuple[Dict[str, str], Dict[str, List[str]]]: + """ + Get name lists split by both race/ethnicity and by gender, given input name lists + (see inner functions for the sources of these lists). + """ + + baby_name_counts_by_gender = get_baby_name_counts_by_gender(baby_name_folder) + + # Split original name lists by gender + orig_name_lists = get_tzioumis_name_lists( + tzioumis_data_path=tzioumis_data_path, + race_gender_name_lists=race_gender_name_lists, + ) + print('\nSplitting original name lists by gender.') + names_to_new_lists = {} + for name_list, names in orig_name_lists.items(): + for name in names: + proc_name = name.replace('-', '') + proc_name = proc_name[0].upper() + proc_name[1:].lower() + if ( + baby_name_counts_by_gender[proc_name]['F'] + > baby_name_counts_by_gender[proc_name]['M'] + ): + names_to_new_lists[name] = f'{name_list}_female' + elif ( + baby_name_counts_by_gender[proc_name]['M'] + > baby_name_counts_by_gender[proc_name]['F'] + ): + names_to_new_lists[name] = f'{name_list}_male' + else: + # Tie + names_to_new_lists[name] = f'{name_list}_unknown' + assert len(names_to_new_lists) == sum( + [len(names) for names in orig_name_lists.values()] + ) + + # Print the names in each of the new name lists + new_name_lists = defaultdict(list) + for name, name_list in names_to_new_lists.items(): + 
new_name_lists[name_list].append(name) + for name_list in sorted(new_name_lists.keys()): + sorted_names = sorted(new_name_lists[name_list]) + print( + f'\nUsing {len(sorted_names):d} names for the {name_list} name list: ' + + ', '.join(sorted_names) + ) + + return names_to_new_lists, new_name_lists + + +def get_baby_name_counts_by_gender(baby_name_folder: str) -> Dict[str, Dict[str, int]]: + """ + Return a dictionary whose keys are baby names and whose values are counts of the + number of babies given that name, split by gender. + + Baby name folder from https://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-national-data, accessed 2021-04-02. + """ + + # Params + final_baby_name_year = 2019 + baby_name_year_range = range(final_baby_name_year - 99, final_baby_name_year + 1) + # Get the most recent 100 years of names + + # Get counts of baby names by gender + baby_name_counts_by_gender = defaultdict(lambda: {'F': 0, 'M': 0}) + for year in baby_name_year_range: + counts_path = os.path.join(baby_name_folder, f'yob{year:d}.txt') + with open(counts_path) as f: + for line in f: + name, gender, count_string = line.split(',') + count = int(count_string.rstrip()) + baby_name_counts_by_gender[name][gender] += count + + return baby_name_counts_by_gender + + +def get_tzioumis_name_lists( + tzioumis_data_path: str, + race_gender_name_lists: Dict[str, List[str]], +) -> Dict[str, List[str]]: + """ + Get race/ethnicity name lists from the Tzioumis work. 
+ """ + percent_df = load_tzioumis_data(tzioumis_data_path) + name_lists = {} + for race_ethnicity in RACES_ETHNICITIES_WITH_NAMES: + name_lists[race_ethnicity] = get_race_ethnicity_name_list_given_tzioumis_data( + percent_df=percent_df, + race_gender_name_lists=race_gender_name_lists, + race_ethnicity=race_ethnicity, + ) + return name_lists + + +def load_tzioumis_data(tzioumis_data_path: str) -> pd.DataFrame: + """ + Load Tzioumis data from + https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi%3A10.7910%2FDVN%2FTYJKEZ + (Paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5839157/) + """ + print(f'Loading Tzioumis data from {tzioumis_data_path}') + percent_df = ( + pd.read_excel(tzioumis_data_path, sheet_name='Data')[ + lambda df: df['firstname'] != 'ALL OTHER FIRST NAMES' + ] + .assign( + firstname=lambda df: df['firstname'].apply(lambda s: s[0] + s[1:].lower()) + ) + .set_index('firstname') + ) + return percent_df + + +def get_race_ethnicity_name_list_given_tzioumis_data( + percent_df: pd.DataFrame, + race_gender_name_lists: Dict[str, List[str]], + race_ethnicity: str, +) -> List[str]: + """ + Given input Tzioumis data (percent_df) and a set of names split by gender and + race/ethnicity from Milkman et al. (2012), Caliskan et al. (2017), and Guo and + Caliskan (2020)., get a list of names for the given race/ethnicity. + """ + + # Params + tzioumis_to_race_gender_mapping = {'hispanic': 'his', 'white': 'ea', 'black': 'aa'} + + # Determine which names are most commonly of the specified race/ethnicity (i.e. 
+ # plurality), and pick the 200 of those that have the most observations for that + # ethnicity + this_ethnicity_column = f'pct{race_ethnicity}' + percent_columns = [f'pct{race_eth}' for race_eth in RACES_ETHNICITIES] + max_percent_series = percent_df[percent_columns].max(axis=1) + percent_plurality_names_df = ( + percent_df[lambda df: df[this_ethnicity_column] == max_percent_series] + .assign( + obs_of_this_ethnicity=lambda df: df['obs'] * df[this_ethnicity_column] / 100 + ) + .sort_values('obs_of_this_ethnicity', ascending=False) + ) + tzioumis_plurality_names = percent_plurality_names_df.iloc[ + :200 + ].index.values.tolist() + + # Combine these names with the Caliskan+ race+gender names and deduplicate + if race_ethnicity in tzioumis_to_race_gender_mapping: + mapped_ethnicity = tzioumis_to_race_gender_mapping[race_ethnicity] + female_race_gender_name_list = race_gender_name_lists[ + f'{mapped_ethnicity}_female' + ] + if mapped_ethnicity == 'aa': + # Avoid the same name in two lists by removing it from this one + female_race_gender_name_list.remove('Yolanda') + elif mapped_ethnicity == 'his': + # Avoid the same name in two lists by removing it from this one + female_race_gender_name_list.remove('Brenda') + male_race_gender_name_list = race_gender_name_lists[f'{mapped_ethnicity}_male'] + combined_names = ( + tzioumis_plurality_names + + female_race_gender_name_list + + male_race_gender_name_list + ) + else: + combined_names = tzioumis_plurality_names + + # Deduplicate and sort + sorted_names = sorted(list(set(combined_names))) + + print( + f'Using {len(sorted_names):d} names for the {race_ethnicity} race/ethnicity: ' + + ', '.join(sorted_names) + ) + + return sorted_names From b4f423cd348aed12028df89b5d5c5995bd9e3934 Mon Sep 17 00:00:00 2001 From: Eric Smith Date: Fri, 21 Oct 2022 15:53:13 -0400 Subject: [PATCH 2/4] Update util.py --- projects/dialogue_bias/util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/projects/dialogue_bias/util.py b/projects/dialogue_bias/util.py index 7d6e33e1621..6e16186d5b6 100644 --- a/projects/dialogue_bias/util.py +++ b/projects/dialogue_bias/util.py @@ -91,7 +91,9 @@ def get_baby_name_counts_by_gender(baby_name_folder: str) -> Dict[str, Dict[str, Return a dictionary whose keys are baby names and whose values are counts of the number of babies given that name, split by gender. - Baby name folder from https://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-national-data, accessed 2021-04-02. + Baby name folder from https://catalog.data.gov/dataset/ + baby-names-from-social-security-card-applications-national-data, + accessed 2021-04-02. """ # Params From a237efb6ccb798f1c5a0a8a6e8bd0abab99fc16c Mon Sep 17 00:00:00 2001 From: Eric Smith Date: Fri, 21 Oct 2022 15:55:25 -0400 Subject: [PATCH 3/4] Update util.py --- projects/dialogue_bias/util.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/projects/dialogue_bias/util.py b/projects/dialogue_bias/util.py index 6e16186d5b6..12d482412ca 100644 --- a/projects/dialogue_bias/util.py +++ b/projects/dialogue_bias/util.py @@ -158,8 +158,10 @@ def get_race_ethnicity_name_list_given_tzioumis_data( ) -> List[str]: """ Given input Tzioumis data (percent_df) and a set of names split by gender and - race/ethnicity from Milkman et al. (2012), Caliskan et al. (2017), and Guo and - Caliskan (2020)., get a list of names for the given race/ethnicity. + race/ethnicity from prior work, get a list of names for the given race/ethnicity. + + The race+gender names come from Milkman et al. (2012), Caliskan et al. (2017), and + Guo and Caliskan (2020).
""" # Params From 6c4106caee3d89c776e3347f65c16101c1fd9ccc Mon Sep 17 00:00:00 2001 From: Eric Smith Date: Mon, 24 Oct 2022 10:46:49 -0400 Subject: [PATCH 4/4] Update util.py --- projects/dialogue_bias/util.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/projects/dialogue_bias/util.py b/projects/dialogue_bias/util.py index 12d482412ca..2dea333fd70 100644 --- a/projects/dialogue_bias/util.py +++ b/projects/dialogue_bias/util.py @@ -12,6 +12,9 @@ RACES_ETHNICITIES = ['hispanic', 'white', 'black', 'api', 'aian', '2prace'] +# Notations for races/ethnicities reflect those used in Tzioumis et al. (see +# https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/TYJKEZ/ +# MPMHFE&version=1.3 for details) RACES_ETHNICITIES_WITH_NAMES = ['hispanic', 'white', 'black', 'api'] # Some races/ethnicities don't have any names on the Tzioumis list for which they are # the plurality race/ethnicity; we exclude those races/ethnicities from this list. @@ -55,6 +58,8 @@ def get_race_ethnicity_gender_name_list( for name in names: proc_name = name.replace('-', '') proc_name = proc_name[0].upper() + proc_name[1:].lower() + # Removing hyphens and changing capitalization to match the formatting of + # the baby-name lists if ( baby_name_counts_by_gender[proc_name]['F'] > baby_name_counts_by_gender[proc_name]['M'] @@ -166,9 +171,11 @@ def get_race_ethnicity_name_list_given_tzioumis_data( # Params tzioumis_to_race_gender_mapping = {'hispanic': 'his', 'white': 'ea', 'black': 'aa'} + num_names_per_race_ethnicity = 200 + # Number of names to select per race/ethnicity, to keep the lists tractable # Determine which names are most commonly of the specified race/ethnicity (i.e. 
- # plurality), and pick the 200 of those that have the most observations for that + # plurality), and pick those that have the most observations for that # ethnicity this_ethnicity_column = f'pct{race_ethnicity}' percent_columns = [f'pct{race_eth}' for race_eth in RACES_ETHNICITIES] @@ -181,7 +188,7 @@ def get_race_ethnicity_name_list_given_tzioumis_data( .sort_values('obs_of_this_ethnicity', ascending=False) ) tzioumis_plurality_names = percent_plurality_names_df.iloc[ - :200 + :num_names_per_race_ethnicity ].index.values.tolist() # Combine these names with the Caliskan+ race+gender names and deduplicate @@ -196,6 +203,8 @@ def get_race_ethnicity_name_list_given_tzioumis_data( elif mapped_ethnicity == 'his': # Avoid the same name in two lists by removing it from this one female_race_gender_name_list.remove('Brenda') + # TODO: add a programmatic way to detect duplicates, to generalize for updated + # versions of the source datasets male_race_gender_name_list = race_gender_name_lists[f'{mapped_ethnicity}_male'] combined_names = ( tzioumis_plurality_names