diff --git a/ansible/templates/covid_act_now-params-prod.json.j2 b/ansible/templates/covid_act_now-params-prod.json.j2 deleted file mode 100644 index c1821a540..000000000 --- a/ansible/templates/covid_act_now-params-prod.json.j2 +++ /dev/null @@ -1,51 +0,0 @@ -{ - "common": { - "export_dir": "./receiving", - "log_filename": "/var/log/indicators/covid_act_now.log" - }, - "indicator": { - "parquet_url": "https://storage.googleapis.com/can-scrape-outputs/final/can_scrape_api_covid_us.parquet" - }, - "archive": { - "cache_dir": "./cache", - "bucket_name": "delphi-covidcast-indicator-output", - "indicator_prefix": "CAN", - "aws_credentials": { - "aws_access_key_id": "{{ delphi_aws_access_key_id }}", - "aws_secret_access_key": "{{ delphi_aws_secret_access_key }}" - } - }, - "validation": { - "common": { - "data_source": "covid-act-now", - "span_length": 14, - "min_expected_lag": {"all": "3"}, - "max_expected_lag": {"all": "9"}, - "dry_run": true, - "suppressed_errors": [ - {"check_name": "check_se_many_missing", - "signal": "pcr_specimen_total_tests"}, - {"check_name": "check_se_not_missing_and_in_range", - "signal": "pcr_specimen_total_tests"}, - {"check_name": "check_n_missing", - "signal": "pcr_specimen_total_tests"}, - {"check_name": "check_se_0_when_val_0"}, - {"check_name": "check_test_vs_reference_avg_changed", - "signal": "pcr_specimen_positivity_rate"} - ] - }, - "static": { - "minimum_sample_size": 0, - "missing_se_allowed": false, - "missing_sample_size_allowed": false - }, - "dynamic": { - "ref_window_size": 7, - "smoothed_signals": [ - ] - } - }, - "delivery": { - "delivery_dir": "/common/covidcast/receiving/covid-act-now" - } -} diff --git a/covid_act_now/.pylintrc b/covid_act_now/.pylintrc deleted file mode 100644 index f30837c7e..000000000 --- a/covid_act_now/.pylintrc +++ /dev/null @@ -1,22 +0,0 @@ - -[MESSAGES CONTROL] - -disable=logging-format-interpolation, - too-many-locals, - too-many-arguments, - # Allow pytest functions to be part of a class. - no-self-use, - # Allow pytest classes to have one test. - too-few-public-methods - -[BASIC] - -# Allow arbitrarily short-named variables. -variable-rgx=[a-z_][a-z0-9_]* -argument-rgx=[a-z_][a-z0-9_]* -attr-rgx=[a-z_][a-z0-9_]* - -[DESIGN] - -# Don't complain about pytest "unused" arguments. -ignored-argument-names=(_.*|run_as_module) \ No newline at end of file diff --git a/covid_act_now/Makefile b/covid_act_now/Makefile deleted file mode 100644 index bc88f1fec..000000000 --- a/covid_act_now/Makefile +++ /dev/null @@ -1,29 +0,0 @@ -.PHONY = venv, lint, test, clean - -dir = $(shell find ./delphi_* -name __init__.py | grep -o 'delphi_[_[:alnum:]]*' | head -1) -venv: - python3.8 -m venv env - -install: venv - . env/bin/activate; \ - pip install wheel ; \ - pip install -e ../_delphi_utils_python ;\ - pip install -e . - -install-ci: venv - . env/bin/activate; \ - pip install wheel ; \ - pip install ../_delphi_utils_python ;\ - pip install . - -lint: - . env/bin/activate; pylint $(dir) - . env/bin/activate; pydocstyle $(dir) - -test: - . env/bin/activate ;\ - (cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing) - -clean: - rm -rf env - rm -f params.json diff --git a/covid_act_now/README.md b/covid_act_now/README.md deleted file mode 100644 index ff16f06ea..000000000 --- a/covid_act_now/README.md +++ /dev/null @@ -1,61 +0,0 @@ -Covid Act Now (CAN) provides several testing metrics at the county and state level from various sources. -This indicator extracts only the county level PCR and specimen-based metrics sourced from the [CDC](https://covid.cdc.gov/covid-data-tracker/#county-view) and additionally aggregates them to state, MSA, HRR, HHS and national levels. - -## Running the Indicator - -The indicator is run by directly executing the Python module contained in this -directory. The safest way to do this is to create a virtual environment, -installed the common DELPHI tools, and then install the module and its -dependencies. To do this, run the following command from this directory: - -``` -make install -``` - -This command will install the package in editable mode, so you can make changes that -will automatically propagate to the installed package. - -All of the user-changable parameters are stored in `params.json`. To execute -the module and produce the output datasets (by default, in `receiving`), run -the following: - -``` -env/bin/python -m delphi_covid_act_now -``` - -If you want to enter the virtual environment in your shell, -you can run `source env/bin/activate`. Run `deactivate` to leave the virtual environment. - -Once you are finished, you can remove the virtual environment and -params file with the following: - -``` -make clean -``` - -## Testing the code - -To run static tests of the code style, run the following command: - -``` -make lint -``` - -Unit tests are also included in the module. To execute these, run the following -command from this directory: - -``` -make test -``` - -To run individual tests, run the following: - -``` -(cd tests && ../env/bin/pytest .py --cov=delphi_covid_act_now --cov-report=term-missing) -``` - -The output will show the number of unit tests that passed and failed, along -with the percentage of code covered by the tests. - -None of the linting or unit tests should fail, and the code lines that are not covered by unit tests should be small and -should not include critical sub-routines. diff --git a/covid_act_now/REVIEW.md b/covid_act_now/REVIEW.md deleted file mode 100644 index 93a5a6579..000000000 --- a/covid_act_now/REVIEW.md +++ /dev/null @@ -1,39 +0,0 @@ -## Code Review (Python) - -A code review of this module should include a careful look at the code and the -output. To assist in the process, but certainly not in replace of it, please -check the following items. - -**Documentation** - -- [ ] the README.md file template is filled out and currently accurate; it is -possible to load and test the code using only the instructions given -- [ ] minimal docstrings (one line describing what the function does) are -included for all functions; full docstrings describing the inputs and expected -outputs should be given for non-trivial functions - -**Structure** - -- [ ] code should use 4 spaces for indentation; other style decisions are -flexible, but be consistent within a module -- [ ] any required metadata files are checked into the repository and placed -within the directory `static` -- [ ] any intermediate files that are created and stored by the module should -be placed in the directory `cache` -- [ ] final expected output files to be uploaded to the API are placed in the -`receiving` directory; output files should not be committed to the respository -- [ ] all options and API keys are passed through the file `params.json` -- [ ] template parameter file (`params.json.template`) is checked into the -code; no personal (i.e., usernames) or private (i.e., API keys) information is -included in this template file - -**Testing** - -- [ ] module can be installed in a new virtual environment -- [ ] pylint with the default `.pylint` settings run over the module produces -minimal warnings; warnings that do exist have been confirmed as false positives -- [ ] reasonably high level of unit test coverage covering all of the main logic -of the code (e.g., missing coverage for raised errors that do not currently seem -possible to reach are okay; missing coverage for options that will be needed are -not) -- [ ] all unit tests run without errors diff --git a/covid_act_now/cache/.gitignore b/covid_act_now/cache/.gitignore deleted file mode 100644 index e69de29bb..000000000 diff --git a/covid_act_now/delphi_covid_act_now/__init__.py b/covid_act_now/delphi_covid_act_now/__init__.py deleted file mode 100644 index 800a750a0..000000000 --- a/covid_act_now/delphi_covid_act_now/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- -"""Module to pull and clean indicators from Covid Act Now. - -This file defines the functions that are made public by the module. As the -module is intended to be executed though the main method, these are primarily -for testing. -""" - -from __future__ import absolute_import - -from . import run - -__version__ = "0.1.0" diff --git a/covid_act_now/delphi_covid_act_now/__main__.py b/covid_act_now/delphi_covid_act_now/__main__.py deleted file mode 100644 index e79d2ba36..000000000 --- a/covid_act_now/delphi_covid_act_now/__main__.py +++ /dev/null @@ -1,11 +0,0 @@ -# -*- coding: utf-8 -*- -"""Call the function run_module when executed. - -This file indicates that calling the module (`python -m delphi_covid_act_now`) will -call the function `run_module` found within the run.py file. There should be -no need to change this template. -""" -from delphi_utils import read_params -from .run import run_module # pragma: no cover - -run_module(read_params()) # pragma: no cover diff --git a/covid_act_now/delphi_covid_act_now/constants.py b/covid_act_now/delphi_covid_act_now/constants.py deleted file mode 100644 index eb26e4460..000000000 --- a/covid_act_now/delphi_covid_act_now/constants.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Registry for signal names.""" - -GEO_RESOLUTIONS = [ - "county", - "state", - "msa", - "hrr", - "hhs", - "nation", -] - -SIGNALS = [ - "pcr_specimen_positivity_rate", - "pcr_specimen_total_tests", -] diff --git a/covid_act_now/delphi_covid_act_now/geo.py b/covid_act_now/delphi_covid_act_now/geo.py deleted file mode 100644 index 691ba9fe7..000000000 --- a/covid_act_now/delphi_covid_act_now/geo.py +++ /dev/null @@ -1,101 +0,0 @@ -"""Geo-aggregation related functions.""" - -import numpy as np -import pandas as pd - -from delphi_utils import GeoMapper - -from .constants import GEO_RESOLUTIONS - -gmpr = GeoMapper() - -def positivity_rate(x): - """ - Find Positivity Rate from binomial counts. - - Assumes input sample_size are all > 0. - - Parameters - ---------- - x: pd.DataFrame - Columns: pcr_tests_positive, sample_size, ... - - Returns - ------- - pd.Series - Positivity Rate of PCR-specimen tests. - """ - p = x.pcr_tests_positive / x.sample_size - - return p - -def std_err(x): - """ - Find Standard Error of a binomial proportion. - - Assumes input sample_size are all > 0. - - Parameters - ---------- - x: pd.DataFrame - Columns: val, sample_size, ... - - Returns - ------- - pd.Series - Standard error of the positivity rate of PCR-specimen tests. - """ - p = x.val - n = x.sample_size - return np.sqrt(p * (1 - p) / n) - -def geo_map(df: pd.DataFrame, geo_res: str) -> pd.DataFrame: - """ - Aggregate county-level PCR testing metrics to other geographical levels specified by `geo_res`. - - Parameters - ---------- - df: pd.DataFrame - Columns: fips, timestamp, pcr_tests_positive, pcr_tests_total, ... - geo_res: str - Geographic resolution to which to aggregate. Valid options: - ("county", "state", "msa", "hrr", "hhs", "nation"). - - Returns - ------- - pd.DataFrame - Dataframe where val is positivity rate and sample_size is total tests. - Columns: geo_id, timestamp, val, sample_size, se - """ - if geo_res not in GEO_RESOLUTIONS: - raise ValueError(f"geo_res must be one of {GEO_RESOLUTIONS}, got '{geo_res}'") - - if (df.pcr_tests_positive > df.pcr_tests_total).any(): - raise ValueError("Found some test positive count greater than the total") - - if (df.pcr_tests_total <= 0).any(): - raise ValueError("Found some test total <= 0") - - if geo_res == "county": - df = (df - .rename(columns={ - "fips": "geo_id", - "pcr_positivity_rate": "val", - "pcr_tests_total": "sample_size"}) - .assign(se=std_err) - ) - - else: - # All other geo_res can be used directly with GeoMapper - if geo_res == "state": - geo_res = "state_id" - - df = (df - .loc[:, ["fips", "timestamp", "pcr_tests_positive", "pcr_tests_total"]] - .pipe(gmpr.replace_geocode, "fips", geo_res, new_col="geo_id") - .rename(columns={"pcr_tests_total": "sample_size"}) - .assign(val=positivity_rate, se=std_err) - .reset_index() - ) - - return df diff --git a/covid_act_now/delphi_covid_act_now/pull.py b/covid_act_now/delphi_covid_act_now/pull.py deleted file mode 100644 index 1a694568f..000000000 --- a/covid_act_now/delphi_covid_act_now/pull.py +++ /dev/null @@ -1,81 +0,0 @@ -"""Functions for downloading CAN data.""" - -import pandas as pd - -RENAME_COLS = { - "dt": "timestamp", - "location": "fips", -} - -def load_data(path: str) -> pd.DataFrame: - """ - Load CAN's data from a local or online parquet file. - - Some important columns are: - - provider: Source of the data - - location_type: State or county level data - - variable_name: Name of available metrics, like pcr_tests_* - - This function also formats and renames the geo and time columns to follow our conventions. - - Parameters - ---------- - path: str - A local path or URL to CAN's parquet file to load from - - Returns - ------- - pd.DataFrame - CAN's data in long format - """ - df_pq = (pd - .read_parquet(path) - .rename(columns=RENAME_COLS) - ) - - # Format fips - df_pq["fips"] = df_pq["fips"].astype(str).str.zfill(5) - - return df_pq - -def extract_testing_metrics(df: pd.DataFrame) -> pd.DataFrame: - """ - Extract just the county-level testing metrics from CAN's data. - - Specifically picks the CDC-sourced metrics only as they are confirmed to be PCR-specimen-based. - Also converts from long to wide format for easier aggregations later on. - - Note that the CDC's metrics are already smoothed (7-day rolling averaged). - - Parameters - ---------- - df: pd.DataFrame - CAN's data in long format - - Returns - ------- - pd.DataFrame - CAN's / CDC's testing data in wide format - Columns: fips, timestamp, pcr_positivity_rate, pcr_tests_positive, pcr_tests_total - """ - # Filter to PCR-specimen rows from CDC and convert from long to wide format - df_tests = ( - df - .query( - """ - age == 'all' and ethnicity == 'all' and sex == 'all' and \ - location_type == 'county' and provider == 'cdc' and \ - variable_name.str.startswith('pcr_tests_') - """) - .pivot(index=["fips", "timestamp"], columns="variable_name", values="value") - .reset_index() - # Filter off rows with 0 sample_size - .query("pcr_tests_total > 0") - # pcr_tests_positive from the CDC is actually positivity rate (percentage) - .rename(columns={"pcr_tests_positive": "pcr_positivity_rate"}) - ) - - df_tests["pcr_positivity_rate"] /= 100 - df_tests["pcr_tests_positive"] = df_tests.pcr_positivity_rate * df_tests.pcr_tests_total - - return df_tests diff --git a/covid_act_now/delphi_covid_act_now/run.py b/covid_act_now/delphi_covid_act_now/run.py deleted file mode 100644 index 7cc96f6e4..000000000 --- a/covid_act_now/delphi_covid_act_now/run.py +++ /dev/null @@ -1,92 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions to call when running the function. - -This module should contain a function called `run_module`, that is executed -when the module is run with `python -m delphi_covid_act_now`. -""" -from datetime import datetime -import time - -import numpy as np - -from delphi_utils import ( - create_export_csv, - get_structured_logger -) - -from .constants import GEO_RESOLUTIONS, SIGNALS -from .geo import geo_map -from .pull import load_data, extract_testing_metrics - -def run_module(params): - """ - Run the CAN testing metrics indicator. - - Parameters - ---------- - params - Dictionary containing indicator configuration. Expected to have the following structure: - - "common": - - "export_dir": str, directory to write output - - "indicator": - - "parquet_url": str, URL of source file in parquet format - - "archive" (optional): if provided, output will be archived with S3 - - "cache_dir": str, directory of locally cached data - - "bucket_name: str, name of S3 bucket to read/write - - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation) - """ - start_time = time.time() - logger = get_structured_logger( - __name__, filename=params["common"].get("log_filename"), - log_exceptions=params["common"].get("log_exceptions", True)) - - # Configuration - export_dir = params["common"]["export_dir"] - parquet_url = params["indicator"]["parquet_url"] - - # Load CAN county-level testing data - logger.info("Pulling CAN data") - df_pq = load_data(parquet_url) - df_county_testing = extract_testing_metrics(df_pq) - - num_exported_files = 0 - min_dates_exported = [] - max_dates_exported = [] - # Perform geo aggregations and export to receiving - for geo_res in GEO_RESOLUTIONS: - logger.info("Generating signal and exporting to CSV", - geo_res = geo_res) - df = geo_map(df_county_testing, geo_res) - - # Export 'pcr_specimen_positivity_rate' - exported_csv_dates = create_export_csv( - df, - export_dir=export_dir, - geo_res=geo_res, - sensor=SIGNALS[0]) - - # Export 'pcr_specimen_total_tests' - df["val"] = df["sample_size"] - df["sample_size"] = np.nan - df["se"] = np.nan - exported_csv_dates = create_export_csv( - df, - export_dir=export_dir, - geo_res=geo_res, - sensor=SIGNALS[1]) - - earliest, latest = min(exported_csv_dates), max(exported_csv_dates) - min_dates_exported.append(earliest) - max_dates_exported.append(latest) - # x2 to count both positivity and tests signals - num_exported_files += exported_csv_dates.size * 2 - logger.info("Exported for dates between", earliest=earliest, latest=latest) - - elapsed_time_in_seconds = round(time.time() - start_time, 2) - max_lag_in_days = (datetime.now() - min(max_dates_exported)).days - logger.info("Completed indicator run", - elapsed_time_in_seconds=elapsed_time_in_seconds, - csv_export_count=num_exported_files, - max_lag_in_days=max_lag_in_days, - earliest_export_date=min(min_dates_exported).strftime("%Y-%m-%d"), - latest_export_date=max(max_dates_exported).strftime("%Y-%m-%d")) diff --git a/covid_act_now/params.json.template b/covid_act_now/params.json.template deleted file mode 100644 index 8774fd064..000000000 --- a/covid_act_now/params.json.template +++ /dev/null @@ -1,51 +0,0 @@ -{ - "common": { - "export_dir": "./receiving", - "log_filename": "./covid_act_now.log" - }, - "indicator": { - "parquet_url": "https://storage.googleapis.com/can-scrape-outputs/final/can_scrape_api_covid_us.parquet" - }, - "archive": { - "cache_dir": "./cache", - "bucket_name": "", - "indicator_prefix": "CAN", - "aws_credentials": { - "aws_access_key_id": "", - "aws_secret_access_key": "" - } - }, - "validation": { - "common": { - "data_source": "covid-act-now", - "span_length": 14, - "min_expected_lag": {"all": "3"}, - "max_expected_lag": {"all": "9"}, - "dry_run": true, - "suppressed_errors": [ - {"check_name": "check_se_many_missing", - "signal": "pcr_specimen_total_tests"}, - {"check_name": "check_se_not_missing_and_in_range", - "signal": "pcr_specimen_total_tests"}, - {"check_name": "check_n_missing", - "signal": "pcr_specimen_total_tests"}, - {"check_name": "check_se_0_when_val_0"}, - {"check_name": "check_test_vs_reference_avg_changed", - "signal": "pcr_specimen_positivity_rate"} - ] - }, - "static": { - "minimum_sample_size": 0, - "missing_se_allowed": false, - "missing_sample_size_allowed": false - }, - "dynamic": { - "ref_window_size": 7, - "smoothed_signals": [ - ] - } - }, - "delivery": { - "delivery_dir": "./receiving" - } -} \ No newline at end of file diff --git a/covid_act_now/setup.py b/covid_act_now/setup.py deleted file mode 100644 index 03ddecc47..000000000 --- a/covid_act_now/setup.py +++ /dev/null @@ -1,30 +0,0 @@ -from setuptools import setup -from setuptools import find_packages - -required = [ - "numpy", - "pandas", - "pydocstyle", - "pytest", - "pytest-cov", - "pylint==2.8.3", - "delphi-utils", - "covidcast", - "pyarrow", -] - -setup( - name="delphi_covid_act_now", - version="0.1.0", - description="Indicators from COVID Act Now", - author="Eu Jing Chua", - author_email="eujingc@andrew.cmu.edu", - url="https://github.com/cmu-delphi/covidcast-indicators", - install_requires=required, - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Programming Language :: Python :: 3.8", - ], - packages=find_packages(), -) diff --git a/covid_act_now/static/.gitignore b/covid_act_now/static/.gitignore deleted file mode 100644 index e69de29bb..000000000 diff --git a/covid_act_now/tests/conftest.py b/covid_act_now/tests/conftest.py deleted file mode 100644 index 486fde6cb..000000000 --- a/covid_act_now/tests/conftest.py +++ /dev/null @@ -1,161 +0,0 @@ -# -*- coding: utf-8 -*- -from os import listdir, remove -from os.path import join - -from boto3 import Session -from moto import mock_s3 -import numpy as np -import pandas as pd -import pytest - - -@pytest.fixture(scope="session") -def clean_receiving_dir(): - # Clean receiving directory - for fname in listdir("receiving"): - if fname not in (".gitkeep", ".gitignore"): - remove(join("receiving", fname)) - - -@pytest.fixture -def CAN_parquet_data(): - columns = ["provider", "dt", "location_id", "location", "location_type", "variable_name", - "measurement", "unit", "age", "race", "ethnicity", "sex", "last_updated", "value"] - data = [ - ["cdc", "2021-01-01", "iso1:us#iso2:us-al#fips:01001", 1001, "county", "pcr_tests_positive", - "rolling_average_7_day", "percentage", "all", "all", "all", "all", "2021-01-02 19:00:00", 50.0], - ["cdc", "2021-01-01", "iso1:us#iso2:us-al#fips:01003", 1003, "county", "pcr_tests_positive", - "rolling_average_7_day", "percentage", "all", "all", "all", "all", "2021-01-02 19:00:00", 25.0], - ["cdc", "2021-01-01", "iso1:us#iso2:us-al#fips:01005", 1005, "county", "pcr_tests_positive", - "rolling_average_7_day", "percentage", "all", "all", "all", "all", "2021-01-02 19:00:00", 50.0], - - ["cdc", "2021-01-01", "iso1:us#iso2:us-al#fips:01001", 1001, "county", "pcr_tests_total", - "rolling_average_7_day", "specimens", "all", "all", "all", "all", "2021-01-02 19:00:00", 10.0], - ["cdc", "2021-01-01", "iso1:us#iso2:us-al#fips:01003", 1003, "county", "pcr_tests_total", - "rolling_average_7_day", "specimens", "all", "all", "all", "all", "2021-01-02 19:00:00", 20.0], - ["cdc", "2021-01-01", "iso1:us#iso2:us-al#fips:01005", 1005, "county", "pcr_tests_total", - "rolling_average_7_day", "specimens", "all", "all", "all", "all", "2021-01-02 19:00:00", 20.0], - - ["cdc", "2021-01-01", "iso1:us#iso2:us-pa#fips:42001", 42001, "county", "pcr_tests_positive", - "rolling_average_7_day", "percentage", "all", "all", "all", "all", "2021-01-02 19:00:00", 50.0], - ["cdc", "2021-01-01", "iso1:us#iso2:us-pa#fips:42003", 42003, "county", "pcr_tests_positive", - "rolling_average_7_day", "percentage", "all", "all", "all", "all", "2021-01-02 19:00:00", 20.0], - ["cdc", "2021-01-01", "iso1:us#iso2:us-pa#fips:42005", 42005, "county", "pcr_tests_positive", - "rolling_average_7_day", "percentage", "all", "all", "all", "all", "2021-01-02 19:00:00", 10.0], - - ["cdc", "2021-01-01", "iso1:us#iso2:us-pa#fips:42001", 42001, "county", "pcr_tests_total", - "rolling_average_7_day", "specimens", "all", "all", "all", "all", "2021-01-02 19:00:00", 10.0], - ["cdc", "2021-01-01", "iso1:us#iso2:us-pa#fips:42003", 42003, "county", "pcr_tests_total", - "rolling_average_7_day", "specimens", "all", "all", "all", "all", "2021-01-02 19:00:00", 20.0], - ["cdc", "2021-01-01", "iso1:us#iso2:us-pa#fips:42005", 42005, "county", "pcr_tests_total", - "rolling_average_7_day", "specimens", "all", "all", "all", "all", "2021-01-02 19:00:00", 10.0], - - ["SOME_SOURCE", "2021-01-15", "iso1:us#iso2:us-fl#fips:12093", 12093, "county", "SOME_OTHER_METRIC", - "SOME_MEASUREMENT", "SOME_UNITS", "all", "all", "all", "all", "2021-01-21 19:00:00", 123.0], - ] - - df_pq = pd.DataFrame(data, columns=columns) - - return df_pq - -@pytest.fixture -def CAN_county_testing_data(): - columns = ["fips", "timestamp", "pcr_tests_positive", "pcr_tests_total", "pcr_positivity_rate"] - data = [ - ["01001", "2021-01-01", 5, 10, 0.5], - ["01003", "2021-01-01", 5, 20, 0.25], - ["01005", "2021-01-01", 10, 20, 0.5], - ["42001", "2021-01-01", 5, 10, 0.5], - ["42003", "2021-01-01", 4, 20, 0.2], - ["42005", "2021-01-01", 1, 10, 0.1], - ] - - df = pd.DataFrame(data, columns=columns) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - p, n = df.pcr_positivity_rate, df.pcr_tests_total - df["se"] = np.sqrt(p * (1 - p) / n) - - return df - -@pytest.fixture -def CAN_state_testing_data(): - columns = ["fips", "timestamp", "pcr_tests_positive", "pcr_tests_total", "pcr_positivity_rate"] - data = [ - ["al", "2021-01-01", 20, 50, 0.4], - ["pa", "2021-01-01", 10, 40, 0.25] - ] - - df = pd.DataFrame(data, columns=columns) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - p, n = df.pcr_positivity_rate, df.pcr_tests_total - df["se"] = np.sqrt(p * (1 - p) / n) - - return df - -@pytest.fixture -def CAN_msa_testing_data(): - columns = ["fips", "timestamp", "pcr_tests_positive", "pcr_tests_total", "pcr_positivity_rate"] - data = [ - ["19300", "2021-01-01", 5, 20, 0.25], - ["23900", "2021-01-01", 5, 10, 0.5], - ["33860", "2021-01-01", 5, 10, 0.5], - ["38300", "2021-01-01", 5, 30, 5 / 30], - ] - - df = pd.DataFrame(data, columns=columns) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - p, n = df.pcr_positivity_rate, df.pcr_tests_total - df["se"] = np.sqrt(p * (1 - p) / n) - - return df - -@pytest.fixture -def CAN_hrr_testing_data(): - columns = ["fips", "timestamp", "pcr_tests_positive", "pcr_tests_total", "pcr_positivity_rate"] - data = [ - ["1", "2021-01-01", 0.195525, 0.391050, 0.5], - ["134", "2021-01-01", 0.159989, 0.639958, 0.25], - ["2", "2021-01-01", 9.743599, 19.487198, 0.5], - ["351", "2021-01-01", 0.0145052, 0.145052, 0.1], - ["352", "2021-01-01", 2.690298, 5.380595, 0.5], - ["357", "2021-01-01", 4.985495, 29.854948, 0.166991], - ["363", "2021-01-01", 2.309702, 4.619405, 0.5], - ["6", "2021-01-01", 4.840011, 19.360042, 0.25], - ["7", "2021-01-01", 5.060876, 10.121752, 0.5], - ] - - df = pd.DataFrame(data, columns=columns) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - p, n = df.pcr_positivity_rate, df.pcr_tests_total - df["se"] = np.sqrt(p * (1 - p) / n) - - return df - -@pytest.fixture -def CAN_hhs_testing_data(): - columns = ["fips", "timestamp", "pcr_tests_positive", "pcr_tests_total", "pcr_positivity_rate"] - data = [ - ["3", "2021-01-01", 10, 40, 0.25], - ["4", "2021-01-01", 20, 50, 0.4], - ] - - df = pd.DataFrame(data, columns=columns) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - p, n = df.pcr_positivity_rate, df.pcr_tests_total - df["se"] = np.sqrt(p * (1 - p) / n) - - return df - -@pytest.fixture -def CAN_nation_testing_data(): - columns = ["fips", "timestamp", "pcr_tests_positive", "pcr_tests_total", "pcr_positivity_rate"] - data = [ - ["us", "2021-01-01", 30, 90, 30 / 90], - ] - - df = pd.DataFrame(data, columns=columns) - df["timestamp"] = pd.to_datetime(df["timestamp"]) - p, n = df.pcr_positivity_rate, df.pcr_tests_total - df["se"] = np.sqrt(p * (1 - p) / n) - - return df diff --git a/covid_act_now/tests/receiving/.gitignore b/covid_act_now/tests/receiving/.gitignore deleted file mode 100644 index afed0735d..000000000 --- a/covid_act_now/tests/receiving/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.csv diff --git a/covid_act_now/tests/test_data/small_CAN_data.parquet b/covid_act_now/tests/test_data/small_CAN_data.parquet deleted file mode 100644 index d58e0b46d..000000000 Binary files a/covid_act_now/tests/test_data/small_CAN_data.parquet and /dev/null differ diff --git a/covid_act_now/tests/test_geo.py b/covid_act_now/tests/test_geo.py deleted file mode 100644 index 0707b1642..000000000 --- a/covid_act_now/tests/test_geo.py +++ /dev/null @@ -1,116 +0,0 @@ - -import numpy as np -import pandas as pd -import pytest - -from delphi_covid_act_now.geo import ( - positivity_rate, - std_err, - geo_map -) - -class TestAggregationFunctions: - def test_pos_rate(self): - df = pd.DataFrame({ - "pcr_tests_positive": [0, 1, 2, 3, 4, 5], - "sample_size": [2, 2, 5, 10, 20, 50] - }) - - # The 0 sample_size case is expected to return 0 following the CDC's convention - expected_pos_rate = [0, 0.5, 0.4, 0.3, 0.2, 0.1] - pos_rate = positivity_rate(df) - - assert np.allclose(pos_rate, expected_pos_rate) - - def test_std_err(self): - df = pd.DataFrame({ - "val": [0, 0.5, 0.4, 0.3, 0.2, 0.1], - "sample_size": [2, 2, 5, 10, 20, 50] - }) - - expected_se = np.sqrt(df.val * (1 - df.val) / df.sample_size) - se = std_err(df) - - # 0 se is permitted in this indicator, since applying the Jeffreys prior would violate the mirror - assert (se >= 0).all() - assert not np.isnan(se).any() - assert not np.isinf(se).any() - assert np.allclose(se, expected_se, equal_nan=True) - -class TestGeoMap: - def test_incorrect_geo(self, CAN_county_testing_data): - df_county = CAN_county_testing_data - - with pytest.raises(ValueError): - geo_map(df_county, "INVALID_GEO_RES") - - def test_incorrect_total(self): - columns = ["fips", "timestamp", "pcr_tests_positive", "pcr_tests_total", "pcr_positivity_rate"] - df_county = pd.DataFrame([ - ["01001", "2021-01-01", 20, 10, 2.0] - ], columns=columns) - - with pytest.raises(ValueError): - geo_map(df_county, "county") - - def test_zero_sample_size(self): - columns = ["fips", "timestamp", "pcr_tests_positive", "pcr_tests_total", "pcr_positivity_rate"] - df_county = pd.DataFrame([ - ["01001", "2021-01-01", 0, 0, 0] - ], columns=columns) - - with pytest.raises(ValueError): - geo_map(df_county, "county") - - def test_county(self, CAN_county_testing_data): - df_county = CAN_county_testing_data - df_new = geo_map(df_county, "county") - - assert np.allclose(df_new["val"], df_county["pcr_positivity_rate"]) - assert np.allclose(df_new["sample_size"], df_county["pcr_tests_total"]) - assert np.allclose(df_new["se"], df_county["se"], equal_nan=True) - - def test_state(self, CAN_county_testing_data, CAN_state_testing_data): - df_county = CAN_county_testing_data - df_state = CAN_state_testing_data - df_new = geo_map(df_county, "state") - - assert np.allclose(df_new["val"], df_state["pcr_positivity_rate"]) - assert np.allclose(df_new["sample_size"], df_state["pcr_tests_total"]) - assert np.allclose(df_new["se"], df_state["se"], equal_nan=True) - - def test_msa(self, CAN_county_testing_data, CAN_msa_testing_data): - df_county = CAN_county_testing_data - df_msa = CAN_msa_testing_data - df_new = geo_map(df_county, "msa") - - assert np.allclose(df_new["val"], df_msa["pcr_positivity_rate"]) - assert np.allclose(df_new["sample_size"], df_msa["pcr_tests_total"]) - assert np.allclose(df_new["se"], df_msa["se"], equal_nan=True) - - def test_hrr(self, CAN_county_testing_data, CAN_hrr_testing_data): - df_county = CAN_county_testing_data - df_hrr = CAN_hrr_testing_data - df_new = geo_map(df_county, "hrr") - - assert np.allclose(df_new["val"], df_hrr["pcr_positivity_rate"]) - assert np.allclose(df_new["sample_size"], df_hrr["pcr_tests_total"]) - assert np.allclose(df_new["se"], df_hrr["se"], equal_nan=True) - - def test_hhs(self, CAN_county_testing_data, CAN_hhs_testing_data): - df_county = CAN_county_testing_data - df_hhs = CAN_hhs_testing_data - df_new = geo_map(df_county, "hhs") - - assert np.allclose(df_new["val"], df_hhs["pcr_positivity_rate"]) - assert np.allclose(df_new["sample_size"], df_hhs["pcr_tests_total"]) - assert np.allclose(df_new["se"], df_hhs["se"], equal_nan=True) - - def test_nation(self, CAN_county_testing_data, CAN_nation_testing_data): - df_county = CAN_county_testing_data - df_nation = CAN_nation_testing_data - df_new = geo_map(df_county, "nation") - - assert np.allclose(df_new["val"], df_nation["pcr_positivity_rate"]) - assert np.allclose(df_new["sample_size"], df_nation["pcr_tests_total"]) - assert np.allclose(df_new["se"], df_nation["se"], equal_nan=True) diff --git a/covid_act_now/tests/test_pull.py b/covid_act_now/tests/test_pull.py deleted file mode 100644 index 96593005d..000000000 --- a/covid_act_now/tests/test_pull.py +++ /dev/null @@ -1,62 +0,0 @@ -import numpy as np -import pandas as pd -import pytest - -from delphi_covid_act_now.pull import ( - load_data, - extract_testing_metrics -) - -class TestPull: - def test_load_data(self, CAN_parquet_data, tmp_path): - path = tmp_path / "small_CAN_data.parquet" - CAN_parquet_data.to_parquet(path) - - df_pq = load_data(path) - - impt_cols = set([ - "fips", "timestamp", - "age", "ethnicity", "sex", - "location_type", "provider", "variable_name" - ]) - - assert impt_cols <= set(df_pq.columns) - - def test_zero_sample_size(self): - columns = ["provider", "timestamp", "location_id", "fips", "location_type", "variable_name", - "measurement", "unit", "age", "race", "ethnicity", "sex", "last_updated", "value"] - df_pq = pd.DataFrame([ - # Should become a zero sample_size row - ["cdc", "2021-01-01", "iso1:us#iso2:us-al#fips:01001", 1001, "county", "pcr_tests_positive", - "rolling_average_7_day", "percentage", "all", "all", "all", "all", "2021-01-02 19:00:00", 0.0], - ["cdc", "2021-01-01", "iso1:us#iso2:us-al#fips:01001", 1001, "county", "pcr_tests_total", - "rolling_average_7_day", "specimens", "all", "all", "all", "all", "2021-01-02 19:00:00", 0.0], - - # A non-zero sample_size row - ["cdc", "2021-01-01", "iso1:us#iso2:us-pa#fips:42001", 42001, "county", "pcr_tests_positive", - "rolling_average_7_day", "percentage", "all", "all", "all", "all", "2021-01-02 19:00:00", 50.0], - ["cdc", "2021-01-01", "iso1:us#iso2:us-pa#fips:42001", 42001, "county", "pcr_tests_total", - "rolling_average_7_day", "specimens", "all", "all", "all", "all", "2021-01-02 19:00:00", 10.0], - ], columns=columns) - - df_tests = extract_testing_metrics(df_pq) - - assert (df_tests.pcr_tests_total > 0).all() - - def test_extract_testing_data(self, CAN_parquet_data, tmp_path): - path = tmp_path / "small_CAN_data.parquet" - CAN_parquet_data.to_parquet(path) - - df_pq = load_data(path) - df_tests = extract_testing_metrics(df_pq) - - impt_cols = set([ - "fips", "timestamp", - "pcr_positivity_rate", "pcr_tests_positive", "pcr_tests_total", - ]) - - assert impt_cols <= set(df_tests.columns) - assert df_tests["pcr_positivity_rate"].between(0, 1).all() - assert np.allclose( - df_tests.pcr_tests_positive, - df_tests.pcr_positivity_rate * df_tests.pcr_tests_total) diff --git a/covid_act_now/tests/test_run.py b/covid_act_now/tests/test_run.py deleted file mode 100644 index 7cec2e1dc..000000000 --- a/covid_act_now/tests/test_run.py +++ /dev/null @@ -1,36 +0,0 @@ -from os import listdir -from os.path import join - -import pandas as pd -import pytest - -from delphi_covid_act_now.constants import GEO_RESOLUTIONS, SIGNALS -from delphi_covid_act_now.run import run_module - -class TestRun: - PARAMS = { - "common": { - "export_dir": "./receiving" - }, - "indicator": { - "parquet_url": "./test_data/small_CAN_data.parquet" - } - } - - def test_output_files(self, clean_receiving_dir): - run_module(self.PARAMS) - csv_files = set(listdir("receiving")) - csv_files.discard(".gitignore") - - expected_files = set() - for signal in SIGNALS: - for geo in GEO_RESOLUTIONS: - expected_files.add(f"20210101_{geo}_{signal}.csv") - - # All output files exist - assert csv_files == expected_files - - # All output files have correct columns - for csv_file in csv_files: - df = pd.read_csv(join("receiving", csv_file)) - assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all()