From cf785cfa8c479211edfb4a6b84a8d263c363a95b Mon Sep 17 00:00:00 2001 From: matt bowen Date: Wed, 26 Oct 2022 14:00:05 -0400 Subject: [PATCH 1/3] Round ALL the float fields for the tiles (#2033) --- .../data_pipeline/etl/score/constants.py | 69 ------------------ .../data_pipeline/etl/score/etl_score_post.py | 10 ++- .../tests/snapshots/tile_data_expected.pkl | Bin 4442 -> 4271 bytes 3 files changed, 6 insertions(+), 73 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index 81e96329e..154b3589a 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -387,72 +387,3 @@ field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT: "TA_PERC", field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT_DISPLAY: "TA_PERC_FE", } - -# columns to round floats to 2 decimals -# TODO refactor to use much smaller subset of fields we DON'T want to round -TILES_SCORE_FLOAT_COLUMNS = [ - field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.PM25_FIELD + 
field_names.PERCENTILE_FIELD_SUFFIX, - field_names.POVERTY_LESS_THAN_100_FPL_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - # Percentiles for Island areas' workforce columns - # To be clear: the island areas pull from 2009 census. PR does not. - field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009 - + field_names.ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 - + field_names.ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - # Island areas HS degree attainment rate - field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009, - field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.DOT_TRAVEL_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.FUTURE_FLOOD_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.FUTURE_WILDFIRE_RISK_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME - + field_names.PERCENTILE_FIELD_SUFFIX, - # Include demographic data for sidebar -- as percents, NOT as percentiles. 
- field_names.PERCENT_BLACK_FIELD_NAME, - field_names.PERCENT_AMERICAN_INDIAN_FIELD_NAME, - field_names.PERCENT_ASIAN_FIELD_NAME, - field_names.PERCENT_HAWAIIAN_FIELD_NAME, - field_names.PERCENT_TWO_OR_MORE_RACES_FIELD_NAME, - field_names.PERCENT_NON_HISPANIC_WHITE_FIELD_NAME, - field_names.PERCENT_HISPANIC_FIELD_NAME, - field_names.PERCENT_OTHER_RACE_FIELD_NAME, - field_names.PERCENT_AGE_UNDER_10, - field_names.PERCENT_AGE_10_TO_64, - field_names.PERCENT_AGE_OVER_64, - # Geojson cannot support nulls in a boolean column when we create tiles; - # to preserve null character, we coerce to floats for all fields - # that use null to signify missing information in a boolean field. - field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME, - field_names.HISTORIC_REDLINING_SCORE_EXCEEDED, - field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT, -] diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index 0296a9e32..fdb28cbe5 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -231,10 +231,12 @@ def _create_tile_data( score_tiles = score_tiles[ ~score_tiles[field_names.GEOID_TRACT_FIELD].isin(tracts_to_drop) ] - - score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[ - constants.TILES_SCORE_FLOAT_COLUMNS - ].apply( + float_cols = [ + col + for col, col_dtype in score_tiles.dtypes.items() + if col_dtype == np.dtype("float64") + ] + score_tiles[float_cols] = score_tiles[float_cols].apply( func=lambda series: floor_series( series=series, number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS, diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl index 7ec7b6f8e6af8e9cba78534d0443b45f184d00c3..7d3ef9b9f5009093c195194a0ab79d04ccad7297 100644 GIT binary patch delta 295 
zcmcbmv|f?5fo19vfsL$!OpK0`#hJ7inF{>8JeB3Q#5+m9di$ F006*+SSA1f delta 397 zcmZ3lcuR@3fn}<{;6_$KCPs(J;!IkM4U?Ui^d=WEWdQlNfqaI^Y|OhipJYyBbMr(`I$PYIf$ z;mzpH0pxggIy(qJO%|Pen^Rt}6v``u(iM}rSU2nFgT18L89POzhutwJF)1-OaY{yM zkAPQdT3TvRaekhHbAD-FN#&G`vdJ%5-%NhaW;R)$J%{T9#29z`51V(gD=<#J&!NGg z54PB;VlqEx_2fyM5|cM^8Zeqp{>-_Yv3&AoE>Fgm&40PhvM{zyKFx2)*fsete*kM1 M4 Date: Wed, 26 Oct 2022 15:50:40 -0400 Subject: [PATCH 2/3] Floor in a simpler way (#2033) Emma pointed out that all the stuff we're doing in floor_series is probably unnecessary for this case, so just use the built-in floor. --- .../data_pipeline/etl/score/etl_score_post.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index fdb28cbe5..87fbecda2 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -236,13 +236,10 @@ def _create_tile_data( for col, col_dtype in score_tiles.dtypes.items() if col_dtype == np.dtype("float64") ] - score_tiles[float_cols] = score_tiles[float_cols].apply( - func=lambda series: floor_series( - series=series, - number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS, - ), - axis=0, - ) + scale_factor = 10**constants.TILES_ROUND_NUM_DECIMALS + score_tiles[float_cols] = ( + score_tiles[float_cols] * scale_factor + ).apply(np.floor) / scale_factor logger.info("Adding fields for island areas and Puerto Rico") # The below operation constructs variables for the front end. 
From d574a6eb2e28f2ee4862b65b9c6828a8f89dfa75 Mon Sep 17 00:00:00 2001 From: matt bowen Date: Thu, 27 Oct 2022 10:19:51 -0400 Subject: [PATCH 3/3] Update pickle I missed (#2033) --- .../tests/snapshots/tile_data_expected.pkl | Bin 4271 -> 4439 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl index 7d3ef9b9f5009093c195194a0ab79d04ccad7297..2666b1512f672e131b3617f38c4a8e1ef825efc3 100644 GIT binary patch delta 991 zcmZ3lcwLFLfn}YWUj85nlRwZ0DA@y#hC6Dv@>W4`10I2CD8q^wu7v8)yVn-Fv9~ zDf?=WvZ^ii-$2T~zOa`ED}|y#~9;=51pMe2tKf*~!au6mKDSIpiK^5Dt zg9aPi9Q%#n#P-JB{s7F8@R)rnrvXw9Oj+