From 981a36cfa34cdc9f861d2111e215f67db5bc9ad0 Mon Sep 17 00:00:00 2001 From: Emma Nechamkin Date: Wed, 17 Aug 2022 11:11:11 -0400 Subject: [PATCH 1/4] first run -- adding NCLD data to the ETL, but not yet to the score --- .../data_pipeline/etl/constants.py | 5 ++ .../etl/score/config/datasets.yml | 34 ++++++++ .../sources/ncld_nature_deprived/README.md | 80 +++++++++++++++++++ .../sources/ncld_nature_deprived/__init__.py | 0 .../etl/sources/ncld_nature_deprived/etl.py | 77 ++++++++++++++++++ 5 files changed, 196 insertions(+) create mode 100644 data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md create mode 100644 data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/__init__.py create mode 100644 data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py index 5f71af3aa..cb6a667b9 100644 --- a/data/data-pipeline/data_pipeline/etl/constants.py +++ b/data/data-pipeline/data_pipeline/etl/constants.py @@ -54,6 +54,11 @@ "module_dir": "hud_housing", "class_name": "HudHousingETL", }, + { + "name": "ncld_nature_deprived", + "module_dir": "ncld_nature_deprived", + "class_name": "NatureDeprivedETL", + }, { "name": "census_acs_median_income", "module_dir": "census_acs_median_income", diff --git a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml index be07d6ccb..6d61e8553 100644 --- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml +++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml @@ -251,3 +251,37 @@ datasets: include_in_tiles: true include_in_downloadable_files: true create_percentile: true + + - long_name: "NCLD Lack of Green Space / Nature-Deprived Communities dataset, as compiled by TPL" + short_name: "ncld_nature_deprived" + module_name: "ncld_nature_deprived" + input_geoid_tract_field_name: 
"GEOID10_TRACT" + load_fields: + - short_name: "ncld_eligible" + df_field_name: "ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME" + long_name: "Does the tract have at least 35 acres in it?" + field_type: bool + include_in_tiles: true + include_in_downloadable_files: true + create_percentile: false + - short_name: "percent_impervious" + df_field_name: "TRACT_PERCENT_IMPERVIOUS_FIELD_NAME" + long_name: "Share of the tract's land area that is covered by impervious surface as a percent" + field_type: percentage + include_in_tiles: true + include_in_downloadable_files: true + create_percentile: true + - short_name: "percent_nonnatural" + df_field_name: "TRACT_PERCENT_NON_NATURAL_FIELD_NAME" + long_name: "Share of the tract's land area that is covered by impervious surface or cropland as a percent" + field_type: percentage + include_in_tiles: true + include_in_downloadable_files: true + create_percentile: true + - short_name: "percent_cropland" + df_field_name: "TRACT_PERCENT_CROPLAND_FIELD_NAME" + long_name: "Share of the tract's land area that is covered by cropland as a percent" + field_type: percentage + include_in_tiles: true + include_in_downloadable_files: true + create_percentile: true \ No newline at end of file diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md new file mode 100644 index 000000000..d8736d54f --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md @@ -0,0 +1,80 @@ +# Nature deprived communities data + +The following dataset was compiled by TPL using NCLD data. We define as: AREA - [CROPLAND] - [IMPERVIOUS SURFACES]. + +## Codebook +- GEOID10 – Census tract ID +- SF – State Name +- CF – County Name +- P200_PFS – Percent of individuals below 200% Federal Poverty Line (from CEJST source data). +- CA_LT20 – Percent higher ed enrollment rate is less than 20% (from CEJST source data). 
+- TractAcres – Acres of tract calculated from ALAND10 field (area land/meters) in 2010 census tracts. + - CAVEAT: Some census tracts in the CEJST source file extend into open water. ALAND10 area was used to constrain percent calculations (e.g. cropland area) to land only. +- AcresCrops – Acres of crops calculated by summing all cells in the NLCD Cropland Data Layer crop classes. +- PctCrops – Formula: AcresCrops/TractAcres*100. +- PctImperv – Mean imperviousness for each census tract. + - CAVEAT: Where tracts extend into open water, mean imperviousness may be underestimated. +- __TO USE__ PctNatural – Formula: 100 – PctCrops – PctImperv. +- PctNat90 – Tract in or below 10th percentile for PctNatural. 1 = True, 0 = False. + - PctNatural 10th percentile = 28.6439% +- ImpOrCrop – If tract >= 90th percentile for PctImperv OR PctCrops. 1 = True, 0 = False. + - PctImperv 90th percentile = 67.4146 % + - PctCrops 90th percentile = 27.8116 % +- LowInAndEd – If tract >= 65th percentile for P200_PFS AND CA_LT20. + - P200_PFS 65th percentile = 64.0% +- NatureDep – ImpOrCrop = 1 AND LowInAndEd = 1. + +We added `GEOID10_TRACT` before converting shapefile to csv. + +## Instructions to recreate + +### Creating Impervious plus Cropland Attributes for Census Tracts + +The Cropland Data Layer and NLCD Impervious layer were too big to put on our OneDrive, but you can download them here: + CDL: https://www.nass.usda.gov/Research_and_Science/Cropland/Release/datasets/2021_30m_cdls.zip + Impervious: https://s3-us-west-2.amazonaws.com/mrlc/nlcd_2019_impervious_l48_20210604.zip + + +#### Crops + +Add an attribute called TractAcres (or similar) to the census tracts to hold a value representing acres covered by the census tract. +Calculate the TractAcres field for each census tract by using the Calculate Geometry tool (set the Property to Area (geodesic), and the Units to Acres). 
+From the Cropland Data Layer (CDL), extract only the pixels representing crops, using the Extract by Attributes tool in ArcGIS Spatial Analyst toolbox. +a. The attribute table tells you the names of each type of land cover. Since the CDL also contains NLCD classes and empty classes, the actual crop classes must be extracted. +From the crops-only raster extracted from the CDL, run the Reclassify tool to create a binary layer where all crops = 1, and everything else is Null. +Run the Tabulate Area tool: +a. Zone data = census tracts +b. Input raster data = the binary crops layer +c. This will produce a table with the square meters of crops in each census tract contained in an attribute called VALUE_1 +Run the Join Field tool to join the table to the census tracts, with the VALUE_1 field as the Transfer Field, to transfer the VALUE_1 field (square meters of crops) to the census tracts. +Add a field to the census tracts called AcresCrops (or similar) to hold the acreage of crops in each census tract. +Calculate the AcresCrops field by multiplying the VALUE_1 field by 0.000247105 to produce acres of crops in each census tracts. +a. You can delete the VALUE_1 field. +Add a field called PctCrops (or similar) to hold the percent of each census tract occupied by crops. +Calculate the PctCrops field by dividing the AcresCrops field by the TractAcres field, and multiply by 100 to get the percent. +Impervious + +Run the Zonal Statistics as Table tool: +a. Zone data = census tracts +b. Input raster data = impervious data raster layer +c. Statistics type = Mean +d. This will produce a table with the percent of each census tract occupied by impervious surfaces, contained in an attribute called MEAN + +Run the Join Field tool to join the table to the census tracts, with the MEAN field as the Transfer Field, to transfer the MEAN field (percent impervious) to the census tracts. + +Add a field called PctImperv (or similar) to hold the percent impervious value. 
+ +Calculate the PctImperv field by setting it equal to the MEAN field. +a. You can delete the MEAN field. +Combine the Crops and Impervious Data + +Open the census tracts attribute table and add a field called PctNatural (or similar). Calculate this field using this equation: 100 – PctCrops – PctImperv . This produces a value that tells you the percent of each census tract covered in natural land cover. + +Define the census tracts that fall in the 90th percentile of non-natural land cover: +a. Add a field called PctNat90 (or similar) +b. Right-click on the PctNatural field, and click Sort Ascending (lowest PctNatural values on top) +c. Select the top 10 percent of rows after the sort +d. Click on Show Selected Records in the attribute table +e. Calculate the PctNat90 field for the selected records = 1 +f. Clear the selection +g. The rows that now have a value of 1 for PctNat90 are the most lacking for natural land cover, and can be symbolized accordingly in a map diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py new file mode 100644 index 000000000..14d49c522 --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py @@ -0,0 +1,77 @@ +# pylint: disable=unsubscriptable-object +# pylint: disable=unsupported-assignment-operation + +import pandas as pd +from data_pipeline.config import settings + +from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel +from data_pipeline.utils import get_module_logger + +logger = get_module_logger(__name__) + + +class NatureDeprivedETL(ExtractTransformLoad): + """ETL class for the Nature Deprived Communities dataset""" + + NAME = "ncld_nature_deprived" + 
SOURCE_URL = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip" + ) + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT + + # Output score variables (values set on datasets.yml) for linting purposes + ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME: str + TRACT_PERCENT_IMPERVIOUS_FIELD_NAME: str + TRACT_PERCENT_NON_NATURAL_FIELD_NAME: str + TRACT_PERCENT_CROPLAND_FIELD_NAME: str + + def __init__(self): + # define the full path for the input CSV file + self.INPUT_CSV = ( + self.get_tmp_path() / "usa_conus_nat_dep__compiled_by_TPL.csv" + ) + + # this is the main dataframe + self.df: pd.DataFrame + + # Start dataset-specific vars here + self.PERCENT_NATURAL_FIELD_NAME = "PctNatural" + self.PERCENT_IMPERVIOUS_FIELD_NAME = "PctImperv" + self.PERCENT_CROPLAND_FIELD_NAME = "PctCrops" + self.TRACT_ACRES_FIELD_NAME = "TractAcres" + # In order to exclude tracts with very small acreage, we want to create an eligibility criterion + # similar to agrivalue. Here, we are ensuring that a tract has at least 35 acres, or is above the 1st percentile + # for area. 
This does indeed remove tracts from the 90th+ percentile later on + self.TRACT_ACRES_LOWER_BOUND = 35 + + def transform(self) -> None: + """Reads the unzipped data file into memory and applies the following + transformations to prepare it for the load() method: + + - Renames columns as needed + """ + logger.info("Transforming NCLD Data") + + logger.info(self.COLUMNS_TO_KEEP) + + df_ncld: pd.DataFrame = pd.read_csv( + self.INPUT_CSV, + dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, + low_memory=False, + ) + + df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = ( + df_ncld[self.TRACT_ACRES_FIELD_NAME] >= self.TRACT_ACRES_LOWER_BOUND + ) + df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = ( + 1 - df_ncld[self.PERCENT_NATURAL_FIELD_NAME] + ) + + # Assign the final df to the class' output_df for the load method with rename + self.output_df = df_ncld.rename( + columns={ + self.PERCENT_IMPERVIOUS_FIELD_NAME: self.TRACT_PERCENT_IMPERVIOUS_FIELD_NAME, + self.PERCENT_CROPLAND_FIELD_NAME: self.TRACT_PERCENT_CROPLAND_FIELD_NAME, + } + ) From 6be3286f49c042e580d5353ef7e96569197697a1 Mon Sep 17 00:00:00 2001 From: Emma Nechamkin Date: Wed, 17 Aug 2022 11:42:50 -0400 Subject: [PATCH 2/4] creates new dataset for ncld nature deprived --- .../data_pipeline/etl/score/config/datasets.yml | 7 ------- .../data_pipeline/etl/sources/fsf_flood_risk/etl.py | 1 - .../data_pipeline/etl/sources/fsf_wildfire_risk/etl.py | 2 -- .../etl/sources/ncld_nature_deprived/README.md | 2 +- .../data_pipeline/etl/sources/ncld_nature_deprived/etl.py | 2 -- 5 files changed, 1 insertion(+), 13 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml index e099c4eff..f89852c91 100644 --- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml +++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml @@ -35,7 +35,6 @@ datasets: include_in_tiles: true include_in_downloadable_files: true 
create_percentile: true - - short_name: "ex_ag_loss" df_field_name: "EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME" long_name: "Expected agricultural loss rate (Natural Hazards Risk Index)" @@ -54,7 +53,6 @@ datasets: include_in_tiles: true include_in_downloadable_files: true create_percentile: true - - short_name: "ex_bldg_loss" df_field_name: "EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME" long_name: "Expected building loss rate (Natural Hazards Risk Index)" @@ -72,7 +70,6 @@ datasets: include_in_tiles: true include_in_downloadable_files: true create_percentile: true - - short_name: "has_ag_val" df_field_name: "CONTAINS_AGRIVALUE" long_name: "Contains agricultural value" @@ -168,7 +165,6 @@ datasets: field_type: float include_in_tiles: true include_in_downloadable_files: true - - long_name: "First Street Foundation Flood Risk" short_name: "FSF Flood Risk" module_name: fsf_flood_risk @@ -209,7 +205,6 @@ datasets: include_in_tiles: false include_in_downloadable_files: true create_percentile: true - - long_name: "First Street Foundation Wildfire Risk" short_name: "FSF Wildfire Risk" module_name: fsf_wildfire_risk @@ -250,7 +245,6 @@ datasets: include_in_tiles: false include_in_downloadable_files: true create_percentile: true - - long_name: "DOT Travel Disadvantage Index" short_name: "DOT" module_name: "travel_composite" @@ -263,7 +257,6 @@ datasets: include_in_tiles: true include_in_downloadable_files: true create_percentile: true - - long_name: "NCLD Lack of Green Space / Nature-Deprived Communities dataset, as compiled by TPL" short_name: "ncld_nature_deprived" module_name: "ncld_nature_deprived" diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py index 78a4aefa9..9776e8011 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py @@ -48,7 +48,6 @@ def transform(self) -> None: """ 
logger.info("Transforming National Risk Index Data") - logger.info(self.COLUMNS_TO_KEEP) # read in the unzipped csv data source then rename the # Census Tract column for merging df_fsf_flood_disagg: pd.DataFrame = pd.read_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py index 2d36a079f..a41ce1e31 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py @@ -47,8 +47,6 @@ def transform(self) -> None: - Calculates share of properties at risk, left-clipping number of properties at 250 """ logger.info("Transforming National Risk Index Data") - - logger.info(self.COLUMNS_TO_KEEP) # read in the unzipped csv data source then rename the # Census Tract column for merging df_fsf_fire_disagg: pd.DataFrame = pd.read_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md index d8736d54f..aa1b6e3b5 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md +++ b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md @@ -1,6 +1,6 @@ # Nature deprived communities data -The following dataset was compiled by TPL using NCLD data. We define as: AREA - [CROPLAND] - [IMPERVIOUS SURFACES]. +The following dataset was compiled by TPL (Trust for Public Lands) using NCLD data. We define as: AREA - [CROPLAND] - [IMPERVIOUS SURFACES]. 
## Codebook - GEOID10 – Census tract ID diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py index 14d49c522..c294ec4be 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py @@ -53,8 +53,6 @@ def transform(self) -> None: """ logger.info("Transforming NCLD Data") - logger.info(self.COLUMNS_TO_KEEP) - df_ncld: pd.DataFrame = pd.read_csv( self.INPUT_CSV, dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, From 07a1cbd0b8fc99bdfc770b3a6a8a283f587c8c15 Mon Sep 17 00:00:00 2001 From: Emma Nechamkin Date: Wed, 17 Aug 2022 11:42:50 -0400 Subject: [PATCH 3/4] adding is_memory_intensive flag --- data/data-pipeline/data_pipeline/etl/constants.py | 1 + .../data_pipeline/etl/score/config/datasets.yml | 7 ------- .../data_pipeline/etl/sources/fsf_flood_risk/etl.py | 1 - .../data_pipeline/etl/sources/fsf_wildfire_risk/etl.py | 2 -- .../etl/sources/ncld_nature_deprived/README.md | 2 +- .../data_pipeline/etl/sources/ncld_nature_deprived/etl.py | 2 -- 6 files changed, 2 insertions(+), 13 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py index 41e9671d8..a6140af59 100644 --- a/data/data-pipeline/data_pipeline/etl/constants.py +++ b/data/data-pipeline/data_pipeline/etl/constants.py @@ -69,6 +69,7 @@ "name": "ncld_nature_deprived", "module_dir": "ncld_nature_deprived", "class_name": "NatureDeprivedETL", + "is_memory_intensive": False, }, { "name": "census_acs_median_income", diff --git a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml index e099c4eff..f89852c91 100644 --- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml +++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml @@ -35,7 +35,6 
@@ datasets: include_in_tiles: true include_in_downloadable_files: true create_percentile: true - - short_name: "ex_ag_loss" df_field_name: "EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME" long_name: "Expected agricultural loss rate (Natural Hazards Risk Index)" @@ -54,7 +53,6 @@ datasets: include_in_tiles: true include_in_downloadable_files: true create_percentile: true - - short_name: "ex_bldg_loss" df_field_name: "EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME" long_name: "Expected building loss rate (Natural Hazards Risk Index)" @@ -72,7 +70,6 @@ datasets: include_in_tiles: true include_in_downloadable_files: true create_percentile: true - - short_name: "has_ag_val" df_field_name: "CONTAINS_AGRIVALUE" long_name: "Contains agricultural value" @@ -168,7 +165,6 @@ datasets: field_type: float include_in_tiles: true include_in_downloadable_files: true - - long_name: "First Street Foundation Flood Risk" short_name: "FSF Flood Risk" module_name: fsf_flood_risk @@ -209,7 +205,6 @@ datasets: include_in_tiles: false include_in_downloadable_files: true create_percentile: true - - long_name: "First Street Foundation Wildfire Risk" short_name: "FSF Wildfire Risk" module_name: fsf_wildfire_risk @@ -250,7 +245,6 @@ datasets: include_in_tiles: false include_in_downloadable_files: true create_percentile: true - - long_name: "DOT Travel Disadvantage Index" short_name: "DOT" module_name: "travel_composite" @@ -263,7 +257,6 @@ datasets: include_in_tiles: true include_in_downloadable_files: true create_percentile: true - - long_name: "NCLD Lack of Green Space / Nature-Deprived Communities dataset, as compiled by TPL" short_name: "ncld_nature_deprived" module_name: "ncld_nature_deprived" diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py index 78a4aefa9..9776e8011 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py +++ 
b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py @@ -48,7 +48,6 @@ def transform(self) -> None: """ logger.info("Transforming National Risk Index Data") - logger.info(self.COLUMNS_TO_KEEP) # read in the unzipped csv data source then rename the # Census Tract column for merging df_fsf_flood_disagg: pd.DataFrame = pd.read_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py index 2d36a079f..a41ce1e31 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py @@ -47,8 +47,6 @@ def transform(self) -> None: - Calculates share of properties at risk, left-clipping number of properties at 250 """ logger.info("Transforming National Risk Index Data") - - logger.info(self.COLUMNS_TO_KEEP) # read in the unzipped csv data source then rename the # Census Tract column for merging df_fsf_fire_disagg: pd.DataFrame = pd.read_csv( diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md index d8736d54f..aa1b6e3b5 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md +++ b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md @@ -1,6 +1,6 @@ # Nature deprived communities data -The following dataset was compiled by TPL using NCLD data. We define as: AREA - [CROPLAND] - [IMPERVIOUS SURFACES]. +The following dataset was compiled by TPL (Trust for Public Lands) using NCLD data. We define as: AREA - [CROPLAND] - [IMPERVIOUS SURFACES]. 
## Codebook - GEOID10 – Census tract ID diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py index 14d49c522..c294ec4be 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py @@ -53,8 +53,6 @@ def transform(self) -> None: """ logger.info("Transforming NCLD Data") - logger.info(self.COLUMNS_TO_KEEP) - df_ncld: pd.DataFrame = pd.read_csv( self.INPUT_CSV, dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, From d45bb87c513dc1dd7d4f0924302bd360e33e27d9 Mon Sep 17 00:00:00 2001 From: Emma Nechamkin Date: Wed, 17 Aug 2022 13:56:07 -0400 Subject: [PATCH 4/4] updated with pr comments --- .../data_pipeline/etl/constants.py | 4 ++-- .../etl/score/config/datasets.yml | 6 +++--- .../data_pipeline/etl/score/etl_score.py | 18 +++--------------- .../README.md | 0 .../__init__.py | 0 .../etl.py | 4 ++-- 6 files changed, 10 insertions(+), 22 deletions(-) rename data/data-pipeline/data_pipeline/etl/sources/{ncld_nature_deprived => nlcd_nature_deprived}/README.md (100%) rename data/data-pipeline/data_pipeline/etl/sources/{ncld_nature_deprived => nlcd_nature_deprived}/__init__.py (100%) rename data/data-pipeline/data_pipeline/etl/sources/{ncld_nature_deprived => nlcd_nature_deprived}/etl.py (97%) diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py index a6140af59..7d76b4f05 100644 --- a/data/data-pipeline/data_pipeline/etl/constants.py +++ b/data/data-pipeline/data_pipeline/etl/constants.py @@ -66,8 +66,8 @@ "is_memory_intensive": False, }, { - "name": "ncld_nature_deprived", - "module_dir": "ncld_nature_deprived", + "name": "nlcd_nature_deprived", + "module_dir": "nlcd_nature_deprived", "class_name": "NatureDeprivedETL", "is_memory_intensive": False, }, diff --git 
a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml index f89852c91..dc06b4f0f 100644 --- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml +++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml @@ -257,9 +257,9 @@ datasets: include_in_tiles: true include_in_downloadable_files: true create_percentile: true - - long_name: "NCLD Lack of Green Space / Nature-Deprived Communities dataset, as compiled by TPL" - short_name: "ncld_nature_deprived" - module_name: "ncld_nature_deprived" + - long_name: "National Land Cover Database (NLCD) Lack of Green Space / Nature-Deprived Communities dataset, as compiled by TPL" + short_name: "nlcd_nature_deprived" + module_name: "nlcd_nature_deprived" input_geoid_tract_field_name: "GEOID10_TRACT" load_fields: - short_name: "ncld_eligible" diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 3b8c45d26..0d942d5c0 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -14,7 +14,7 @@ from data_pipeline.etl.sources.fsf_flood_risk.etl import ( FloodRiskETL, ) -from data_pipeline.etl.sources.ncld_nature_deprived.etl import NatureDeprivedETL +from data_pipeline.etl.sources.nlcd_nature_deprived.etl import NatureDeprivedETL from data_pipeline.etl.sources.fsf_wildfire_risk.etl import WildfireRiskETL from data_pipeline.score.score_runner import ScoreRunner from data_pipeline.score import field_names @@ -136,7 +136,7 @@ def extract(self) -> None: # Load flood risk data self.fsf_flood_df = FloodRiskETL.get_data_frame() - # Load NCLD Nature-Deprived Communities data + # Load NLCD Nature-Deprived Communities data self.nature_deprived_df = NatureDeprivedETL.get_data_frame() # Load GeoCorr Urban Rural Map @@ -519,7 +519,7 @@ def _prepare_initial_df(self) -> pd.DataFrame: # the ability to 
discern which tracts truly are at the 90th percentile, since many tracts have 0 value. # # For *Non-Natural Space*, we may only want to include tracts that have at least 35 acreas, I think. This will - # get rid of tracts that we think are aberrations statistically. Right now, we have commented this out + # get rid of tracts that we think are aberrations statistically. Right now, we have left this out # pending ground-truthing. for numeric_column in numeric_columns: @@ -544,18 +544,6 @@ def _prepare_initial_df(self) -> pd.DataFrame: logger.info( f"Dropping {len(drop_tracts)} tracts from Linguistic Isolation" ) - # elif ( - # numeric_column - # == field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME - # ): - # drop_tracts = df_copy[ - # df_copy[field_names.TRACT_ELIGIBLE_FOR_NONNATURAL_THRESHOLD] - # .astype(bool) - # .fillna(False) - # ][field_names.GEOID_TRACT_FIELD].to_list() - # logger.info( - # f"Dropping {len(drop_tracts)} tracts from non-natural space indicator" - # ) df_copy = self._add_percentiles_to_df( df=df_copy, diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/README.md similarity index 100% rename from data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/README.md rename to data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/README.md diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/__init__.py similarity index 100% rename from data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/__init__.py rename to data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/__init__.py diff --git a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py similarity index 97% rename from 
data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py rename to data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py index c294ec4be..e9951da24 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ncld_nature_deprived/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py @@ -13,7 +13,7 @@ class NatureDeprivedETL(ExtractTransformLoad): """ETL class for the Nature Deprived Communities dataset""" - NAME = "ncld_nature_deprived" + NAME = "nlcd_nature_deprived" SOURCE_URL = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip" @@ -51,7 +51,7 @@ def transform(self) -> None: - Renames columns as needed """ - logger.info("Transforming NCLD Data") + logger.info("Transforming NLCD Data") df_ncld: pd.DataFrame = pd.read_csv( self.INPUT_CSV,