Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rescaling linguistic isolation #1750

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 57 additions & 56 deletions data/data-pipeline/data_pipeline/etl/score/etl_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@ def _add_percentiles_to_df(
df: pd.DataFrame,
input_column_name: str,
output_column_name_root: str,
drop_tracts: list = None,
ascending: bool = True,
) -> pd.DataFrame:
"""Creates percentiles.
Expand Down Expand Up @@ -298,73 +299,39 @@ def _add_percentiles_to_df(
something like "3rd grade reading proficiency" and `output_column_name_root`
may be something like "Low 3rd grade reading proficiency".
"""
if (
output_column_name_root
!= field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
):

# We have two potential options for assessing how to calculate percentiles.
# For the vast majority of columns, we will simply calculate percentiles overall.
# However, for Linguistic Isolation and Agricultural Value Loss, there exist conditions
# for which we drop out tracts from consideration in the percentile. More details on those
# are below, for them, we provide a list of tracts to not include.
# Because of the fancy transformations below, I have removed the urban / rural percentiles,
# which are now deprecated.
if not drop_tracts:
# Create the "basic" percentile.
df[
f"{output_column_name_root}"
f"{field_names.PERCENTILE_FIELD_SUFFIX}"
] = df[input_column_name].rank(pct=True, ascending=ascending)

else:
# For agricultural loss, we are using whether there is value at all to determine percentile and then
# filling places where the value is False with 0
tmp_series = df[input_column_name].where(
~df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts),
np.nan,
)
logger.info(
f"Creating special case column for percentiles from {input_column_name}"
)
df[
f"{output_column_name_root}"
f"{field_names.PERCENTILE_FIELD_SUFFIX}"
] = (
df.where(
df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].astype(float)
== 1.0
)[input_column_name]
.rank(ascending=ascending, pct=True)
.fillna(
df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].astype(float)
)
)

# Create the urban/rural percentiles.
urban_rural_percentile_fields_to_combine = []
for (urban_or_rural_string, urban_heuristic_bool) in [
("urban", True),
("rural", False),
]:
# Create a field with only those values
this_category_only_value_field = (
f"{input_column_name} (value {urban_or_rural_string} only)"
)
df[this_category_only_value_field] = np.where(
df[field_names.URBAN_HEURISTIC_FIELD] == urban_heuristic_bool,
df[input_column_name],
None,
)
] = tmp_series.rank(ascending=ascending, pct=True)

# Calculate the percentile for only this category
this_category_only_percentile_field = (
f"{output_column_name_root} "
f"(percentile {urban_or_rural_string} only)"
)
df[this_category_only_percentile_field] = df[
this_category_only_value_field
].rank(
pct=True,
# Set ascending to the parameter value.
ascending=ascending,
)

# Add the field name to this list. Later, we'll combine this list.
urban_rural_percentile_fields_to_combine.append(
this_category_only_percentile_field
)

# Combine both urban and rural into one field:
df[
f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}"
] = df[urban_rural_percentile_fields_to_combine].mean(
axis=1, skipna=True
)
# Check that "drop tracts" were dropped (quicker than creating a fixture?)
assert df[df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts)][
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added this instead of a formal test -- this sort of test doesn't seem fit nicely within our fixture testing framework, and these sort of assert statements are compatible / match the rest of the codebase.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense! Appreciate the spirit of testing even without a formal testing framework.

f"{output_column_name_root}"
f"{field_names.PERCENTILE_FIELD_SUFFIX}"
].isna().sum() == len(drop_tracts), "Not all tracts were dropped"

return df

Expand Down Expand Up @@ -523,13 +490,47 @@ def _prepare_initial_df(self) -> pd.DataFrame:
df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)

# Convert all columns to numeric and do math
# Note that we have a few special conditions here, that we handle explicitly.
# For *Linguistic Isolation*, we do NOT want to include Puerto Rico in the percentile
# calculation. This is because linguistic isolation as a category doesn't make much sense
# in Puerto Rico, where Spanish is a recognized language. Thus, we construct a list
# of tracts to drop from the percentile calculation.
#
# For *Expected Agricultural Loss*, we only want to include in the percentile tracts
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great comments!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lol ty

# in which there is some agricultural value. This helps us adjust the data such that we have
# the ability to discern which tracts truly are at the 90th percentile, since many tracts have 0 value.

for numeric_column in numeric_columns:
drop_tracts = []
if (
numeric_column
== field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
):
drop_tracts = df_copy[
~df_copy[field_names.AGRICULTURAL_VALUE_BOOL_FIELD]
.astype(bool)
.fillna(False)
][field_names.GEOID_TRACT_FIELD].to_list()
logger.info(
f"Dropping {len(drop_tracts)} tracts from Agricultural Value Loss"
)

elif numeric_column == field_names.LINGUISTIC_ISO_FIELD:
drop_tracts = df_copy[
# 72 is the FIPS code for Puerto Rico
df_copy[field_names.GEOID_TRACT_FIELD].str.startswith("72")
][field_names.GEOID_TRACT_FIELD].to_list()
logger.info(
f"Dropping {len(drop_tracts)} tracts from Linguistic Isolation"
)

df_copy = self._add_percentiles_to_df(
df=df_copy,
input_column_name=numeric_column,
# For this use case, the input name and output name root are the same.
output_column_name_root=numeric_column,
ascending=True,
drop_tracts=drop_tracts,
)

# Min-max normalization:
Expand Down