From 641eecb3ffb36c0167ecd0b9f2253ed5cd2844ee Mon Sep 17 00:00:00 2001
From: matt bowen
Date: Wed, 21 Sep 2022 16:23:00 -0400
Subject: [PATCH] Better document based on Lucas's feedback (#1835)

---
 data/data-pipeline/README.md                  | 28 +++++++++++++++----
 .../data_pipeline/tests/score/test_output.py  | 14 +++++++++-
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md
index d61b17130..7cbf2ed39 100644
--- a/data/data-pipeline/README.md
+++ b/data/data-pipeline/README.md
@@ -12,11 +12,14 @@
     - [2. Extract-Transform-Load (ETL) the data](#2-extract-transform-load-etl-the-data)
     - [3. Combined dataset](#3-combined-dataset)
     - [4. Tileset](#4-tileset)
+    - [5. Shapefiles](#5-shapefiles)
   - [Score generation and comparison workflow](#score-generation-and-comparison-workflow)
     - [Workflow Diagram](#workflow-diagram)
     - [Step 0: Set up your environment](#step-0-set-up-your-environment)
     - [Step 1: Run the script to download census data or download from the Justice40 S3 URL](#step-1-run-the-script-to-download-census-data-or-download-from-the-justice40-s3-url)
     - [Step 2: Run the ETL script for each data source](#step-2-run-the-etl-script-for-each-data-source)
+      - [Table of commands](#table-of-commands)
+      - [ETL steps](#etl-steps)
     - [Step 3: Calculate the Justice40 score experiments](#step-3-calculate-the-justice40-score-experiments)
     - [Step 4: Compare the Justice40 score experiments to other indices](#step-4-compare-the-justice40-score-experiments-to-other-indices)
   - [Data Sources](#data-sources)
@@ -26,21 +29,27 @@
   - [MacOS](#macos)
   - [Windows Users](#windows-users)
   - [Setting up Poetry](#setting-up-poetry)
-  - [Downloading Census Block Groups GeoJSON and Generating CBG CSVs](#downloading-census-block-groups-geojson-and-generating-cbg-csvs)
+  - [Running tox](#running-tox)
+  - [The Application entrypoint](#the-application-entrypoint)
+  - [Downloading Census Block Groups GeoJSON and Generating CBG CSVs (not normally required)](#downloading-census-block-groups-geojson-and-generating-cbg-csvs-not-normally-required)
+  - [Run all ETL, score and map generation processes](#run-all-etl-score-and-map-generation-processes)
+  - [Run both ETL and score generation processes](#run-both-etl-and-score-generation-processes)
+  - [Run all ETL processes](#run-all-etl-processes)
   - [Generating Map Tiles](#generating-map-tiles)
   - [Serve the map locally](#serve-the-map-locally)
   - [Running Jupyter notebooks](#running-jupyter-notebooks)
   - [Activating variable-enabled Markdown for Jupyter notebooks](#activating-variable-enabled-markdown-for-jupyter-notebooks)
-  - [Miscellaneous](#miscellaneous)
   - [Testing](#testing)
     - [Background](#background)
-    - [Configuration / Fixtures](#configuration--fixtures)
+    - [Score and post-processing tests](#score-and-post-processing-tests)
       - [Updating Pickles](#updating-pickles)
-      - [Future Enchancements](#future-enchancements)
-    - [ETL Unit Tests](#etl-unit-tests)
+      - [Future Enhancements](#future-enhancements)
+    - [Fixtures used in ETL "snapshot tests"](#fixtures-used-in-etl-snapshot-tests)
+    - [Other ETL Unit Tests](#other-etl-unit-tests)
      - [Extract Tests](#extract-tests)
       - [Transform Tests](#transform-tests)
       - [Load Tests](#load-tests)
+    - [Smoketests](#smoketests)
@@ -496,3 +505,12 @@ See above [Fixtures](#configuration--fixtures) section for information about whe
 These make use of [tmp_path_factory](https://docs.pytest.org/en/latest/how-to/tmp_path.html) to create a file-system located under `temp_dir`, and validate whether the correct
 files are written to the correct locations.
 Additional future modifications could include the use of Pandera and/or other schema validation tools, and/or a more explicit test that the data written to file can be read back in and yield the same dataframe.
+
+### Smoketests
+
+To ensure the score and tile data are generated correctly, there is a suite of "smoke tests" that can be run after the ETL and score generation steps have completed.
+These tests are implemented as pytest tests, but are skipped by default. To run them:
+
+1. Generate a full score with `poetry run python3 data_pipeline/application.py score-full-run`
+2. Generate the tile data with `poetry run python3 data_pipeline/application.py generate-score-post -s aws`
+3. Select the smoke tests for pytest with `poetry run pytest data_pipeline/tests -k smoketest`
\ No newline at end of file
diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py
index 0945fb9e9..560be486f 100644
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@@ -254,6 +254,15 @@ def test_data_sources(
         key: value for key, value in locals().items() if key != "final_score_df"
     }
 
+    # For each data source that's injected via the fixtures, do the following:
+    # * Ensure at least one column from the source shows up in the score
+    # * Ensure any tracts NOT in the data source are NA/null in the score
+    # * Ensure the data source doesn't have a large number of tract IDs that are
+    #   not included in the final score, since that implies the source is using
+    #   2020 tract IDs
+    # * Verify that the data from the source that's in the final score output
+    #   is equal to the data from the ETL, allowing for the minor differences
+    #   that come from floating point comparisons
     for data_source_name, data_source in data_sources.items():
         final = "final_"
         df: pd.DataFrame = final_score_df.merge(
@@ -275,7 +284,7 @@
         ), f"No columns from data source show up in final score in source {data_source_name}"
 
         # Make sure we have NAs for any tracts in the final data that aren't
-        # covered in the final data
+        # included in the data source
         assert np.all(df[df.MERGE == "left_only"][final_columns].isna())
 
         # Make sure the datasource doesn't have a ton of unmatched tracts, implying it
@@ -293,6 +302,7 @@
                 f"Column {final_column} not equal "
                 f"between {data_source_name} and final score"
             )
+            # For non-numeric types, we can use the built-in equals from pandas
             if df[final_column].dtype in [
                 np.dtype(object),
                 np.dtype(bool),
@@ -301,6 +311,8 @@
                 assert df[final_column].equals(
                     df[data_source_column]
                 ), error_message
+            # For numeric sources, use np.allclose so we don't get harmed by
+            # float equality weirdness
             else:
                 assert np.allclose(
                     df[final_column],
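
The dtype-gated comparison described by the comments this patch adds is a reusable pattern: exact `Series.equals` for non-numeric columns, `np.allclose` for numeric ones. Below is a minimal, self-contained sketch of that pattern; the example frames, column names, and the `equal_nan=True` flag are illustrative assumptions, not code from this patch.

```python
# A minimal sketch (illustrative, not from this patch) of the comparison
# pattern the new comments describe: exact equality for non-numeric
# columns, tolerance-based equality for floats.
import numpy as np
import pandas as pd

final = pd.DataFrame({"flag": [True, False], "score": [0.1, np.nan]})
source = pd.DataFrame({"flag": [True, False], "score": [0.1 + 1e-12, np.nan]})

for column in final.columns:
    if final[column].dtype in [np.dtype(object), np.dtype(bool)]:
        # Non-numeric columns: pandas' .equals checks values, dtype,
        # and NA placement exactly.
        assert final[column].equals(source[column]), f"{column} differs"
    else:
        # Numeric columns: np.allclose tolerates float representation
        # noise; equal_nan=True (an assumption here) treats NaNs in the
        # same positions as equal.
        assert np.allclose(
            final[column], source[column], equal_nan=True
        ), f"{column} differs"
```

The asymmetry is deliberate: strict `.equals` is what you want for flags and labels, while `np.allclose` keeps float round-trip noise from failing the test.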