From 4a25a28b0ec2905e0e5650897e6a53f6e4a4b57c Mon Sep 17 00:00:00 2001
From: Emma Nechamkin
Date: Thu, 25 Aug 2022 10:54:13 -0400
Subject: [PATCH 1/7] just testing that the boolean is preserved on gha

---
 data/data-pipeline/data_pipeline/score/score_narwhal.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py
index 5fb2923c3..7e91a6c26 100644
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@@ -496,6 +496,13 @@ def _pollution_factor(self) -> bool:
             field_names.AML_BOOLEAN
         ].fillna(False)
 
+        logger.info(
+            f"{ self.df[field_names.AML_BOOLEAN_FILLED_IN].value_counts(dropna=False)}"
+        )
+        logger.info(
+            f"{ self.df[field_names.AML_BOOLEAN].value_counts(dropna=False)}"
+        )
+
         self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = self.df[
             [
                 field_names.RMP_PCTILE_THRESHOLD,

From 9a2193d1a45d27d6be3de6c47cebbd0b969328c1 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin
Date: Thu, 25 Aug 2022 16:37:23 -0400
Subject: [PATCH 2/7] checking drop tracts works

---
 .../tests/score/test_score_narwhal_methods.py |  84 +++++++++++++++
 .../data/test_drop_tracts_from_percentile.csv | 101 ++++++++++++++++++
 2 files changed, 185 insertions(+)
 create mode 100644 data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py
 create mode 100644 data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv

diff --git a/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py b/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py
new file mode 100644
index 000000000..cec98318a
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/tests/score/test_score_narwhal_methods.py
@@ -0,0 +1,84 @@
+import pandas as pd
+import pytest
+from data_pipeline.config import settings
+import data_pipeline.score.field_names as field_names
+from data_pipeline.etl.score.etl_score import ScoreETL
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+@pytest.fixture
+def toy_score_df(scope="module"):
+    return pd.read_csv(
+        settings.APP_ROOT
+        / "tests"
+        / "score"
+        / "test_utils"
+        / "data"
+        / "test_drop_tracts_from_percentile.csv",
+        dtype={field_names.GEOID_TRACT_FIELD: str},
+    )
+
+
+def _helper_test_dropping_tracts(toy_score_df, drop_tracts):
+    logger.info(drop_tracts)
+    test_frame = toy_score_df[
+        ~toy_score_df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts)
+    ]
+    return_df = ScoreETL._add_percentiles_to_df(
+        df=toy_score_df,
+        input_column_name="to_rank",
+        output_column_name_root="to_rank_auto",
+        drop_tracts=drop_tracts,
+    )
+
+    test_frame = test_frame.assign(
+        true_rank=test_frame["to_rank"].rank(pct=True)
+    )
+
+    check_frame = test_frame.merge(
+        return_df[
+            [
+                field_names.GEOID_TRACT_FIELD,
+                "to_rank_auto" + field_names.PERCENTILE_FIELD_SUFFIX,
+            ]
+        ],
+        on=[field_names.GEOID_TRACT_FIELD],
+    )
+
+    return check_frame["true_rank"].equals(
+        check_frame["to_rank_auto" + field_names.PERCENTILE_FIELD_SUFFIX]
+    )
+
+
+def test_drop_0_tracts(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=[]
+    ), "Percentile in score fails when we do not drop any tracts"
+
+
+def test_drop_1_tract(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=["1"]
+    ), "Percentile in score fails when we do drop a single tract"
+
+
+def test_drop_2_tracts(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df, drop_tracts=["1", "2"]
+    ), "Percentile in score fails when we drop two tracts"
+
+
+def test_drop_many_tracts(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df,
+        drop_tracts=toy_score_df[field_names.GEOID_TRACT_FIELD].to_list()[:5],
+    ), "Percentile in score fails when we drop many tracts"
+
+
+def test_drop_all_tracts(toy_score_df):
+    assert _helper_test_dropping_tracts(
+        toy_score_df,
+        drop_tracts=toy_score_df[field_names.GEOID_TRACT_FIELD].to_list(),
+    ), "Percentile in score fails when we drop all tracts"
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv b/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv
new file mode 100644
index 000000000..5177546cc
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/tests/score/test_utils/data/test_drop_tracts_from_percentile.csv
@@ -0,0 +1,101 @@
+GEOID10_TRACT,to_rank
+1,1
+2,2
+3,3
+4,4
+5,5
+6,6
+7,7
+8,8
+9,9
+10,10
+11,11
+12,12
+13,13
+14,14
+15,15
+16,16
+17,17
+18,18
+19,19
+20,20
+21,21
+22,22
+23,23
+24,24
+25,25
+26,26
+27,27
+28,28
+29,29
+30,30
+31,31
+32,32
+33,33
+34,34
+35,35
+36,36
+37,37
+38,38
+39,39
+40,40
+41,41
+42,42
+43,43
+44,44
+45,45
+46,46
+47,47
+48,48
+49,49
+50,50
+51,51
+52,52
+53,53
+54,54
+55,55
+56,56
+57,57
+58,58
+59,59
+60,60
+61,61
+62,62
+63,63
+64,64
+65,65
+66,66
+67,67
+68,68
+69,69
+70,70
+71,71
+72,72
+73,73
+74,74
+75,75
+76,76
+77,77
+78,78
+79,79
+80,80
+81,81
+82,82
+83,83
+84,84
+85,85
+86,86
+87,87
+88,88
+89,89
+90,90
+91,91
+92,92
+93,93
+94,94
+95,95
+96,96
+97,97
+98,98
+99,99
+100,100

From d16d0109a4f71887af5dab43acc65b7afc6435e1 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin <97977170+emma-nechamkin@users.noreply.github.com>
Date: Thu, 25 Aug 2022 16:48:42 -0400
Subject: [PATCH 3/7] OOPS! Old changes persisted

---
 data/data-pipeline/data_pipeline/score/score_narwhal.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py
index 7e91a6c26..5fb2923c3 100644
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@@ -496,13 +496,6 @@ def _pollution_factor(self) -> bool:
             field_names.AML_BOOLEAN
         ].fillna(False)
 
-        logger.info(
-            f"{ self.df[field_names.AML_BOOLEAN_FILLED_IN].value_counts(dropna=False)}"
-        )
-        logger.info(
-            f"{ self.df[field_names.AML_BOOLEAN].value_counts(dropna=False)}"
-        )
-
         self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = self.df[
             [
                 field_names.RMP_PCTILE_THRESHOLD,

From b63c465885d203d6e02718fdc9a0c0dd87155c66 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin
Date: Thu, 25 Aug 2022 17:15:33 -0400
Subject: [PATCH 4/7] adding a check to the agvalue calculation for nri

---
 .../etl/sources/national_risk_index/etl.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
index 0b7ff12eb..d1373602d 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@@ -153,6 +153,19 @@ def transform(self) -> None:
             lower=self.AGRIVALUE_LOWER_BOUND
         )
 
+        ## Check that this clip worked -- that the only place the value has changed is when the clip took effect
+        base_expectation = (
+            disaster_agriculture_sum_series
+            / df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
+        )
+        assert (
+            df_nri[
+                df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+                != base_expectation
+            ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
+            < self.AGRIVALUE_LOWER_BOUND
+        )
+
         # This produces a boolean that is True in the case of non-zero agricultural value
         df_nri[self.CONTAINS_AGRIVALUE] = (
             df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0

From c5244470ed250a1b92d55591c3a1c39e5c8535ed Mon Sep 17 00:00:00 2001
From: Emma Nechamkin
Date: Thu, 25 Aug 2022 18:38:22 -0400
Subject: [PATCH 5/7] updated with error messages

---
 .../data_pipeline/etl/sources/national_risk_index/etl.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
index d1373602d..51ffcfa08 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@@ -164,7 +164,12 @@ def transform(self) -> None:
                 != base_expectation
             ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
             < self.AGRIVALUE_LOWER_BOUND
-        )
+        ), "Clipping the agrivalue did not work!"
+
+        assert (
+            df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+            != base_expectation
+        ).sum() > 0, "Clipping the agrivalue did nothing!"
 
         # This produces a boolean that is True in the case of non-zero agricultural value
         df_nri[self.CONTAINS_AGRIVALUE] = (

From 15b4f5b61730a546baa77bddec4f00d40dc359f5 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin
Date: Fri, 26 Aug 2022 10:12:45 -0400
Subject: [PATCH 6/7] updated error message

---
 .../data_pipeline/etl/sources/national_risk_index/etl.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
index 51ffcfa08..c6a312c0f 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@@ -163,8 +163,11 @@ def transform(self) -> None:
                 df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
                 != base_expectation
             ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
-            < self.AGRIVALUE_LOWER_BOUND
-        ), "Clipping the agrivalue did not work!"
+            <= self.AGRIVALUE_LOWER_BOUND
+        ), (
+            "Clipping the agrivalue did not work. There are places where the value doesn't "
+            + "match an unclipped ratio, even where the agrivalue is above the lower bound!"
+        )
 
         assert (
             df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]

From d917880135f6639d1728c742caa247e0daa19673 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin
Date: Wed, 7 Sep 2022 18:11:29 -0400
Subject: [PATCH 7/7] first pass at removing from map

---
 .../data_pipeline/etl/score/etl_score_geo.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
index da02beef7..31eacbe1d 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
@@ -60,6 +60,7 @@ def __init__(self, data_source: str = None):
             field_names.GEOID_TRACT_FIELD
         ]
         self.GEOMETRY_FIELD_NAME = "geometry"
+        self.LAND_FIELD_NAME = "ALAND10"
 
         # We will adjust this upwards while there is some fractional value
         # in the score. This is a starting value.
@@ -86,13 +87,22 @@ def extract(self) -> None:
         )
 
         logger.info("Reading US GeoJSON (~6 minutes)")
-        self.geojson_usa_df = gpd.read_file(
+        full_geojson_usa_df = gpd.read_file(
             self.CENSUS_USA_GEOJSON,
             dtype={self.GEOID_FIELD_NAME: "string"},
-            usecols=[self.GEOID_FIELD_NAME, self.GEOMETRY_FIELD_NAME],
+            usecols=[
+                self.GEOID_FIELD_NAME,
+                self.GEOMETRY_FIELD_NAME,
+                self.LAND_FIELD_NAME,
+            ],
             low_memory=False,
         )
 
+        # We only want to keep tracts to visualize that have non-0 land
+        self.geojson_usa_df = full_geojson_usa_df[
+            full_geojson_usa_df[self.LAND_FIELD_NAME] > 0
+        ]
+
         logger.info("Reading score CSV")
         self.score_usa_df = pd.read_csv(
             self.TILE_SCORE_CSV,