Skip to content

Commit

Permalink
Nans changehc:
Browse files: browse the repository at this point in the history
* allow nan values, add missing columns, and test
  • Loading branch information
dshemetov committed Apr 27, 2021
1 parent 9b75e07 commit 2801e6e
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 18 deletions.
29 changes: 24 additions & 5 deletions changehc/delphi_changehc/update_sensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# third party
import numpy as np
import pandas as pd
from delphi_utils import GeoMapper, read_params, add_prefix
from delphi_utils import GeoMapper, add_prefix, Nans

# first party
from .config import Config
Expand Down Expand Up @@ -46,7 +46,7 @@ def write_to_csv(output_dict, write_se, out_name, output_path="."):
out_name,
)
with open(filename, "w") as outfile:
outfile.write("geo_id,val,se,direction,sample_size\n")
outfile.write("geo_id,val,se,direction,sample_size,missing_val,missing_se,missing_sample_size\n")
for geo_id in geo_ids:
sensor = all_rates[geo_id][i]
se = all_se[geo_id][i]
Expand All @@ -57,17 +57,36 @@ def write_to_csv(output_dict, write_se, out_name, output_path="."):
logging.warning("value suspiciously high, {0}: {1}".format(
geo_id, sensor
))
assert se < 5, f"se suspiciously high, {geo_id}: {se}"
assert se < 5, "se is suspiciously high, {0}: {1}".format(
geo_id, sensor
)
if write_se:
assert sensor > 0 and se > 0, "p=0, std_err=0 invalid"
outfile.write(
"%s,%f,%s,%s,%s\n" % (geo_id, sensor, se, NA, NA))
"%s,%f,%s,%s,%s,%d,%d,%d\n" % (
geo_id, sensor, se, NA, NA,
Nans.NOT_MISSING.value, Nans.NOT_MISSING.value, Nans.PRIVACY.value
)
)
else:
# for privacy reasons we will not report the standard error
outfile.write(
"%s,%f,%s,%s,%s\n" % (geo_id, sensor, NA, NA, NA)
"%s,%f,%s,%s,%s,%d,%d,%d\n" % (
geo_id, sensor, NA, NA, NA,
Nans.NOT_MISSING.value, Nans.PRIVACY.value, Nans.PRIVACY.value
)
)
out_n += 1
else:
logging.warning("writing insufficient data for geo_id {0}, {1}".format(
geo_id, i
))
outfile.write(
"%s,%s,%s,%s,%s,%d,%d,%d\n" % (
geo_id, NA, NA, NA, NA,
Nans.PRIVACY.value, Nans.PRIVACY.value, Nans.NOT_APPLICABLE.value
)
)
logging.debug("wrote {0} rows for {1} {2}".format(
out_n, len(geo_ids), geo_level
))
Expand Down
29 changes: 16 additions & 13 deletions changehc/tests/test_update_sensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def test_geo_reindex(self):
def test_update_sensor(self):
"""Tests that the sensors are properly updated."""
outputs = {}
for geo in ["county", "state", "hhs", "nation"]:
geos = ["county", "state", "hhs", "nation"]
for geo in geos:
td = TemporaryDirectory()
su_inst = CHCSensorUpdator(
"03-01-2020",
Expand All @@ -116,17 +117,17 @@ def test_update_sensor(self):
"den": [30, 50, 50, 10, 1, 5, 5, 50, 50, 50, 0, 0, 0] * 2,
"date": list(pd.date_range("20200301", "20200313")) * 2}).set_index(
["fips", "date"])
# breakpoint()
su_inst.update_sensor(small_test_data, td.name)
for f in os.listdir(td.name):
outputs[f] = pd.read_csv(os.path.join(td.name, f))
assert len(os.listdir(td.name)) == len(su_inst.sensor_dates),\
f"failed {geo} update sensor test"
td.cleanup()
assert outputs["20200319_county_smoothed_outpatient_covid.csv"].empty
assert outputs["20200319_state_smoothed_outpatient_covid.csv"].empty
assert outputs["20200319_hhs_smoothed_outpatient_covid.csv"].empty
assert outputs["20200319_nation_smoothed_outpatient_covid.csv"].empty

value_columns = ["val", "se", "direction", "sample_size"]
for geo in geos:
assert np.isnan(outputs["20200319_" + geo + "_smoothed_outpatient_covid.csv"][value_columns]).all().all()
assert outputs["20200319_" + geo + "_smoothed_outpatient_covid.csv"]["missing_val"].eq(3).all()

class TestWriteToCsv:
"""Tests for writing output files to CSV."""
Expand Down Expand Up @@ -161,8 +162,9 @@ def test_write_to_csv_results(self):
expected_name = "20200502_geography_name_of_signal.csv"
assert exists(join(td.name, expected_name))
output_data = pd.read_csv(join(td.name, expected_name))
expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"]
assert (
output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
output_data.columns == expected_columns
).all()
assert (output_data.geo_id == ["a", "b"]).all()
assert np.array_equal(output_data.val.values, np.array([0.1, 1]))
Expand All @@ -175,11 +177,12 @@ def test_write_to_csv_results(self):
expected_name = "20200503_geography_name_of_signal.csv"
assert exists(join(td.name, expected_name))
output_data = pd.read_csv(join(td.name, expected_name))

assert (
output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
output_data.columns == expected_columns
).all()
assert (output_data.geo_id == ["a"]).all()
assert np.array_equal(output_data.val.values, np.array([0.5]))
assert (output_data.geo_id == ["a", "b"]).all()
assert np.array_equal(output_data.val.values, np.array([0.5, np.nan]), equal_nan=True)
assert np.isnan(output_data.se.values).all()
assert np.isnan(output_data.direction.values).all()
assert np.isnan(output_data.sample_size.values).all()
Expand All @@ -188,7 +191,7 @@ def test_write_to_csv_results(self):
assert exists(join(td.name, expected_name))
output_data = pd.read_csv(join(td.name, expected_name))
assert (
output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
output_data.columns == expected_columns
).all()
assert (output_data.geo_id == ["a", "b"]).all()
assert np.array_equal(output_data.val.values, np.array([1.5, 3]))
Expand Down Expand Up @@ -224,13 +227,13 @@ def test_write_to_csv_with_se_results(self):

td = TemporaryDirectory()
write_to_csv(res0, True, "name_of_signal", td.name)

# check outputs
expected_name = "20200502_geography_name_of_signal.csv"
expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"]
assert exists(join(td.name, expected_name))
output_data = pd.read_csv(join(td.name, expected_name))
assert (
output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
output_data.columns == expected_columns
).all()
assert (output_data.geo_id == ["a", "b"]).all()
assert np.array_equal(output_data.val.values, np.array([0.1, 1]))
Expand Down

0 comments on commit 2801e6e

Please sign in to comment.