From 2801e6e2f5c4ebc14f8b417a8fb2dff06a88eed3 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 9 Mar 2021 11:51:22 -0800 Subject: [PATCH] Nans changehc: * allow nan values, add missing columns, and test --- changehc/delphi_changehc/update_sensor.py | 29 +++++++++++++++++++---- changehc/tests/test_update_sensor.py | 29 +++++++++++++---------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/changehc/delphi_changehc/update_sensor.py b/changehc/delphi_changehc/update_sensor.py index c0e63af8c0..93ed741ad5 100644 --- a/changehc/delphi_changehc/update_sensor.py +++ b/changehc/delphi_changehc/update_sensor.py @@ -11,7 +11,7 @@ # third party import numpy as np import pandas as pd -from delphi_utils import GeoMapper, read_params, add_prefix +from delphi_utils import GeoMapper, add_prefix, Nans # first party from .config import Config @@ -46,7 +46,7 @@ def write_to_csv(output_dict, write_se, out_name, output_path="."): out_name, ) with open(filename, "w") as outfile: - outfile.write("geo_id,val,se,direction,sample_size\n") + outfile.write("geo_id,val,se,direction,sample_size,missing_val,missing_se,missing_sample_size\n") for geo_id in geo_ids: sensor = all_rates[geo_id][i] se = all_se[geo_id][i] @@ -57,17 +57,36 @@ def write_to_csv(output_dict, write_se, out_name, output_path="."): logging.warning("value suspiciously high, {0}: {1}".format( geo_id, sensor )) - assert se < 5, f"se suspiciously high, {geo_id}: {se}" + assert se < 5, "se is suspiciously high, {0}: {1}".format( + geo_id, se + ) if write_se: assert sensor > 0 and se > 0, "p=0, std_err=0 invalid" outfile.write( - "%s,%f,%s,%s,%s\n" % (geo_id, sensor, se, NA, NA)) + "%s,%f,%s,%s,%s,%d,%d,%d\n" % ( + geo_id, sensor, se, NA, NA, + Nans.NOT_MISSING.value, Nans.NOT_MISSING.value, Nans.PRIVACY.value + ) + ) else: # for privacy reasons we will not report the standard error outfile.write( - "%s,%f,%s,%s,%s\n" % (geo_id, sensor, NA, NA, NA) + "%s,%f,%s,%s,%s,%d,%d,%d\n" % ( + geo_id, sensor, 
NA, NA, NA, + Nans.NOT_MISSING.value, Nans.PRIVACY.value, Nans.PRIVACY.value + ) ) out_n += 1 + else: + logging.warning("writing insufficient data for geo_id {0}, {1}".format( + geo_id, i + )) + outfile.write( + "%s,%s,%s,%s,%s,%d,%d,%d\n" % ( + geo_id, NA, NA, NA, NA, + Nans.PRIVACY.value, Nans.PRIVACY.value, Nans.NOT_APPLICABLE.value + ) + ) logging.debug("wrote {0} rows for {1} {2}".format( out_n, len(geo_ids), geo_level )) diff --git a/changehc/tests/test_update_sensor.py b/changehc/tests/test_update_sensor.py index 3ec2dccf6d..61ee1881a5 100644 --- a/changehc/tests/test_update_sensor.py +++ b/changehc/tests/test_update_sensor.py @@ -93,7 +93,8 @@ def test_geo_reindex(self): def test_update_sensor(self): """Tests that the sensors are properly updated.""" outputs = {} - for geo in ["county", "state", "hhs", "nation"]: + geos = ["county", "state", "hhs", "nation"] + for geo in geos: td = TemporaryDirectory() su_inst = CHCSensorUpdator( "03-01-2020", @@ -116,17 +117,17 @@ def test_update_sensor(self): "den": [30, 50, 50, 10, 1, 5, 5, 50, 50, 50, 0, 0, 0] * 2, "date": list(pd.date_range("20200301", "20200313")) * 2}).set_index( ["fips", "date"]) + # breakpoint() su_inst.update_sensor(small_test_data, td.name) for f in os.listdir(td.name): outputs[f] = pd.read_csv(os.path.join(td.name, f)) assert len(os.listdir(td.name)) == len(su_inst.sensor_dates),\ f"failed {geo} update sensor test" td.cleanup() - assert outputs["20200319_county_smoothed_outpatient_covid.csv"].empty - assert outputs["20200319_state_smoothed_outpatient_covid.csv"].empty - assert outputs["20200319_hhs_smoothed_outpatient_covid.csv"].empty - assert outputs["20200319_nation_smoothed_outpatient_covid.csv"].empty - + value_columns = ["val", "se", "direction", "sample_size"] + for geo in geos: + assert np.isnan(outputs["20200319_" + geo + "_smoothed_outpatient_covid.csv"][value_columns]).all().all() + assert outputs["20200319_" + geo + "_smoothed_outpatient_covid.csv"]["missing_val"].eq(3).all() class 
TestWriteToCsv: """Tests for writing output files to CSV.""" @@ -161,8 +162,9 @@ def test_write_to_csv_results(self): expected_name = "20200502_geography_name_of_signal.csv" assert exists(join(td.name, expected_name)) output_data = pd.read_csv(join(td.name, expected_name)) + expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"] assert ( - output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"] + output_data.columns == expected_columns ).all() assert (output_data.geo_id == ["a", "b"]).all() assert np.array_equal(output_data.val.values, np.array([0.1, 1])) @@ -175,11 +177,12 @@ def test_write_to_csv_results(self): expected_name = "20200503_geography_name_of_signal.csv" assert exists(join(td.name, expected_name)) output_data = pd.read_csv(join(td.name, expected_name)) + assert ( - output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"] + output_data.columns == expected_columns ).all() - assert (output_data.geo_id == ["a"]).all() - assert np.array_equal(output_data.val.values, np.array([0.5])) + assert (output_data.geo_id == ["a", "b"]).all() + assert np.array_equal(output_data.val.values, np.array([0.5, np.nan]), equal_nan=True) assert np.isnan(output_data.se.values).all() assert np.isnan(output_data.direction.values).all() assert np.isnan(output_data.sample_size.values).all() @@ -188,7 +191,7 @@ def test_write_to_csv_results(self): assert exists(join(td.name, expected_name)) output_data = pd.read_csv(join(td.name, expected_name)) assert ( - output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"] + output_data.columns == expected_columns ).all() assert (output_data.geo_id == ["a", "b"]).all() assert np.array_equal(output_data.val.values, np.array([1.5, 3])) @@ -224,13 +227,13 @@ def test_write_to_csv_with_se_results(self): td = TemporaryDirectory() write_to_csv(res0, True, "name_of_signal", td.name) - # check outputs expected_name = 
"20200502_geography_name_of_signal.csv" + expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"] assert exists(join(td.name, expected_name)) output_data = pd.read_csv(join(td.name, expected_name)) assert ( - output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"] + output_data.columns == expected_columns ).all() assert (output_data.geo_id == ["a", "b"]).all() assert np.array_equal(output_data.val.values, np.array([0.1, 1]))