From 4a25a28b0ec2905e0e5650897e6a53f6e4a4b57c Mon Sep 17 00:00:00 2001
From: Emma Nechamkin
Date: Thu, 25 Aug 2022 10:54:13 -0400
Subject: [PATCH 1/7] just testing that the boolean is preserved on gha

---
 data/data-pipeline/data_pipeline/score/score_narwhal.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py
index 5fb2923c3..7e91a6c26 100644
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@@ -496,6 +496,13 @@ def _pollution_factor(self) -> bool:
             field_names.AML_BOOLEAN
         ].fillna(False)
 
+        logger.info(
+            f"{ self.df[field_names.AML_BOOLEAN_FILLED_IN].value_counts(dropna=False)}"
+        )
+        logger.info(
+            f"{ self.df[field_names.AML_BOOLEAN].value_counts(dropna=False)}"
+        )
+
         self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = self.df[
             [
                 field_names.RMP_PCTILE_THRESHOLD,

From 9a2193d1a45d27d6be3de6c47cebbd0b969328c1 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin
Date: Thu, 25 Aug 2022 16:37:23 -0400
Subject: [PATCH 2/7] checking drop tracts works

---
 .../tests/score/test_score_narwhal_methods.py |  84 +++++++++++++++
 .../data/test_drop_tracts_from_percentile.csv | 101 ++++++++++++++++++
 2 files changed, 185 insertions(+)
 create mode 100644 data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py
 create mode 100644 data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv

diff --git a/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py b/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py
new file mode 100644
index 000000000..cec98318a
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py
@@ -0,0 +1,84 @@
+import pandas as pd
+import pytest
+from data_pipeline.config import settings
+import data_pipeline.score.field_names as field_names
+from data_pipeline.etl.score.etl_score import ScoreETL
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+@pytest.fixture
+def toy_score_df(scope="module"):
+    return pd.read_csv(
+        settings.APP_ROOT
+        / "tests"
+        / "score"
+        / "test_utils"
+        / "data"
+        / "test_drop_tracts_from_percentile.csv",
+        dtype={field_names.GEOID_TRACT_FIELD: str},
+    )
+
+
+def _helper_test_dropping_tracts(toy_score_df, drop_tracts):
+    logger.info(drop_tracts)
+    test_frame = toy_score_df[
+        ~toy_score_df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts)
+    ]
+    return_df = ScoreETL._add_percentiles_to_df(
+        df=toy_score_df,
+        input_column_name="to_rank",
+        output_column_name_root="to_rank_auto",
+        drop_tracts=drop_tracts,
+    )
+
+    test_frame = test_frame.assign(
+        true_rank=test_frame["to_rank"].rank(pct=True)
+    )
+
+    check_frame = test_frame.merge(
+        return_df[
+            [
+                field_names.GEOID_TRACT_FIELD,
+                "to_rank_auto" + field_names.PERCENTILE_FIELD_SUFFIX,
+            ]
+        ],
+        on=[field_names.GEOID_TRACT_FIELD],
+    )
+
+    return check_frame["true_rank"].equals(
+        check_frame["to_rank_auto" + field_names.PERCENTILE_FIELD_SUFFIX]
+    )
+
+
+def test_drop_0_tracts(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=[]
+    ), "Percentile in score fails when we do not drop any tracts"
+
+
+def test_drop_1_tract(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=["1"]
+    ), "Percentile in score fails when we do drop a single tract"
+
+
+def test_drop_2_tracts(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=["1", "2"]
+    ), "Percentile in score fails when we drop two tracts"
+
+
+def test_drop_many_tracts(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df,
+        drop_tracts=toy_score_df[field_names.GEOID_TRACT_FIELD].to_list()[:5],
+    ), "Percentile in score fails when we drop many tracts"
+
+
+def test_drop_all_tracts(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df,
+        drop_tracts=toy_score_df[field_names.GEOID_TRACT_FIELD].to_list(),
+    ), "Percentile in score fails when we drop all tracts"
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv b/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv
new file mode 100644
index 000000000..5177546cc
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv
@@ -0,0 +1,101 @@
+GEOID10_TRACT,to_rank
+1,1
+2,2
+3,3
+4,4
+5,5
+6,6
+7,7
+8,8
+9,9
+10,10
+11,11
+12,12
+13,13
+14,14
+15,15
+16,16
+17,17
+18,18
+19,19
+20,20
+21,21
+22,22
+23,23
+24,24
+25,25
+26,26
+27,27
+28,28
+29,29
+30,30
+31,31
+32,32
+33,33
+34,34
+35,35
+36,36
+37,37
+38,38
+39,39
+40,40
+41,41
+42,42
+43,43
+44,44
+45,45
+46,46
+47,47
+48,48
+49,49
+50,50
+51,51
+52,52
+53,53
+54,54
+55,55
+56,56
+57,57
+58,58
+59,59
+60,60
+61,61
+62,62
+63,63
+64,64
+65,65
+66,66
+67,67
+68,68
+69,69
+70,70
+71,71
+72,72
+73,73
+74,74
+75,75
+76,76
+77,77
+78,78
+79,79
+80,80
+81,81
+82,82
+83,83
+84,84
+85,85
+86,86
+87,87
+88,88
+89,89
+90,90
+91,91
+92,92
+93,93
+94,94
+95,95
+96,96
+97,97
+98,98
+99,99
+100,100

From d16d0109a4f71887af5dab43acc65b7afc6435e1 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin <97977170+emma-nechamkin@users.noreply.github.com>
Date: Thu, 25 Aug 2022 16:48:42 -0400
Subject: [PATCH 3/7] OOPS! Old changes persisted

---
 data/data-pipeline/data_pipeline/score/score_narwhal.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py
index 7e91a6c26..5fb2923c3 100644
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@@ -496,13 +496,6 @@ def _pollution_factor(self) -> bool:
             field_names.AML_BOOLEAN
         ].fillna(False)
 
-        logger.info(
-            f"{ self.df[field_names.AML_BOOLEAN_FILLED_IN].value_counts(dropna=False)}"
-        )
-        logger.info(
-            f"{ self.df[field_names.AML_BOOLEAN].value_counts(dropna=False)}"
-        )
-
         self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = self.df[
             [
                 field_names.RMP_PCTILE_THRESHOLD,

From b63c465885d203d6e02718fdc9a0c0dd87155c66 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin
Date: Thu, 25 Aug 2022 17:15:33 -0400
Subject: [PATCH 4/7] adding a check to the agvalue calculation for nri

---
 .../etl/sources/national_risk_index/etl.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
index 0b7ff12eb..d1373602d 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@@ -153,6 +153,19 @@ def transform(self) -> None:
             lower=self.AGRIVALUE_LOWER_BOUND
         )
 
+        ## Check that this clip worked -- that the only place the value has changed is when the clip took effect
+        base_expectation = (
+            disaster_agriculture_sum_series
+            / df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
+        )
+        assert (
+            df_nri[
+                df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+                != base_expectation
+            ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
+            < self.AGRIVALUE_LOWER_BOUND
+        )
+
         # This produces a boolean that is True in the case of non-zero agricultural value
         df_nri[self.CONTAINS_AGRIVALUE] = (
             df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0

From c5244470ed250a1b92d55591c3a1c39e5c8535ed Mon Sep 17 00:00:00 2001
From: Emma Nechamkin
Date: Thu, 25 Aug 2022 18:38:22 -0400
Subject: [PATCH 5/7] updated with error messages

---
 .../data_pipeline/etl/sources/national_risk_index/etl.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
index d1373602d..51ffcfa08 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@@ -164,7 +164,12 @@ def transform(self) -> None:
                 != base_expectation
             ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
             < self.AGRIVALUE_LOWER_BOUND
-        )
+        ), "Clipping the agrivalue did not work!"
+
+        assert (
+            df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+            != base_expectation
+        ).sum() > 0, "Clipping the agrivalue did nothing!"
 
         # This produces a boolean that is True in the case of non-zero agricultural value
         df_nri[self.CONTAINS_AGRIVALUE] = (

From 15b4f5b61730a546baa77bddec4f00d40dc359f5 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin
Date: Fri, 26 Aug 2022 10:12:45 -0400
Subject: [PATCH 6/7] updated error message

---
 .../data_pipeline/etl/sources/national_risk_index/etl.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
index 51ffcfa08..c6a312c0f 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@@ -163,8 +163,11 @@ def transform(self) -> None:
                 df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
                 != base_expectation
             ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
-            < self.AGRIVALUE_LOWER_BOUND
-        ), "Clipping the agrivalue did not work!"
+            <= self.AGRIVALUE_LOWER_BOUND
+        ), (
+            "Clipping the agrivalue did not work. There are places where the value doesn't "
+            + "match an unclipped ratio, even where the agrivalue is above the lower bound!"
+        )
 
         assert (
             df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]

From d917880135f6639d1728c742caa247e0daa19673 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin
Date: Wed, 7 Sep 2022 18:11:29 -0400
Subject: [PATCH 7/7] first pass at removing from map

---
 .../data_pipeline/etl/score/etl_score_geo.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
index da02beef7..31eacbe1d 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
@@ -60,6 +60,7 @@ def __init__(self, data_source: str = None):
             field_names.GEOID_TRACT_FIELD
         ]
         self.GEOMETRY_FIELD_NAME = "geometry"
+        self.LAND_FIELD_NAME = "ALAND10"
 
         # We will adjust this upwards while there is some fractional value
         # in the score. This is a starting value.
@@ -86,13 +87,22 @@ def extract(self) -> None:
         )
 
         logger.info("Reading US GeoJSON (~6 minutes)")
-        self.geojson_usa_df = gpd.read_file(
+        full_geojson_usa_df = gpd.read_file(
             self.CENSUS_USA_GEOJSON,
             dtype={self.GEOID_FIELD_NAME: "string"},
-            usecols=[self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME],
+            usecols=[
+                self.GEOID_FIELD_NAME,
+                self.GEOMETRY_FIELD_NAME,
+                self.LAND_FIELD_NAME,
+            ],
             low_memory=False,
         )
 
+        # We only want to keep tracts to visualize that have non-0 land
+        self.geojson_usa_df = full_geojson_usa_df[
+            full_geojson_usa_df[self.LAND_FIELD_NAME] > 0
+        ]
+
         logger.info("Reading score CSV")
         self.score_usa_df = pd.read_csv(
             self.TILE_SCORE_CSV,