Skip to content

Commit

Permalink
Merge pull request #327 from cmu-delphi/sir-gapdetector
Browse files Browse the repository at this point in the history
Add a gap detector to Sir Complains-a-lot
  • Loading branch information
krivard authored Oct 19, 2020
2 parents 261503a + 39df546 commit 0783ebd
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 7 deletions.
68 changes: 62 additions & 6 deletions sir_complainsalot/delphi_sir_complainsalot/check_source.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from dataclasses import dataclass
from typing import List

import covidcast
import numpy as np
import pandas as pd

@dataclass
Expand All @@ -27,33 +29,87 @@ def to_md(self):
message=self.message, updated=self.last_updated.strftime("%Y-%m-%d"))

def check_source(data_source, meta, params, grace):
    """Iterate over all signals from a source and check for problems.

    Possible problems:

    - Newest available data exceeds max age.
    - Gap between subsequent data points exceeds max gap.

    For example, consider a source with a max age of 5 days and max gap of 1
    day. If today is 2020-10-15, and the latest available data is from
    2020-10-09, the max age is exceeded. If there is no data available on
    2020-10-07, but there is on 2020-10-06 and 2020-10-08, there is a gap of 2
    days and the max gap is exceeded.

    The gap window controls how much data we check for gaps -- a gap window of
    10 days means we check the most recent 10 days of data. Defaults to 7.

    Parameters
    ----------
    data_source : str
        Name of the source to check. Used as the key into `params` and to
        select rows of `meta` via its `data_source` column.
    meta : pandas.DataFrame
        Signal metadata. The columns read here are `data_source`, `signal`,
        `geo_type` and `max_time`; `max_time` must support subtraction from a
        `pandas.Timestamp`.
    params : dict
        Per-source configuration keyed by source name. Keys read per source:
        `max_age` and `maintainers` (required once a signal is checked), and
        the optional `retired-signals`, `gap_window` (days, default 7) and
        `max_gap` (days, default 1; -1 disables gap detection).
    grace : int
        Extra days added to `max_age` before an age complaint is raised.

    Returns
    -------
    list
        Complaint objects: all age complaints followed by all gap complaints.
        Each complaint covers one signal and accumulates the affected geo
        types.
    """
    source_config = params[data_source]
    gap_window = pd.Timedelta(days=source_config.get("gap_window", 7))
    max_allowed_gap = source_config.get("max_gap", 1)
    retired_signals = source_config.get("retired-signals", ())

    signals = meta[meta.data_source == data_source]

    now = pd.Timestamp.now()

    age_complaints = {}
    gap_complaints = {}

    for _, row in signals.iterrows():
        if row["signal"] in retired_signals:
            continue

        # Check max age
        age = (now - row["max_time"]).days
        if age > source_config["max_age"] + grace:
            _record_complaint(
                age_complaints,
                "is more than {age} days old".format(age=age),
                data_source, row, source_config)

        # Check max gap
        if max_allowed_gap == -1:
            # No gap detection for this source
            continue

        latest_data = covidcast.signal(
            data_source, row["signal"],
            start_day=row["max_time"] - gap_window,
            end_day=row["max_time"],
            geo_type=row["geo_type"]
        )
        if latest_data is None:
            # The API returned no rows for the window, so there is nothing
            # to measure a gap between; staleness is the age check's job.
            continue

        gap = _max_gap_days(latest_data["time_value"])
        if gap > max_allowed_gap:
            _record_complaint(
                gap_complaints,
                "has a {gap}-day gap of missing data in its most recent "
                "{gap_window} days of data".format(
                    gap=gap, gap_window=gap_window.days),
                data_source, row, source_config)

    return list(age_complaints.values()) + list(gap_complaints.values())


def _max_gap_days(time_values):
    """Return the largest day difference between consecutive distinct dates.

    Values are converted to `datetime.date`, de-duplicated and sorted before
    differencing, so the result does not depend on input order. With fewer
    than two distinct dates there is no pair to compare and 0 is returned
    (instead of `max()` raising on an empty sequence).
    """
    days = sorted({pd.to_datetime(val).date() for val in time_values})
    return max(
        ((later - earlier).days for earlier, later in zip(days, days[1:])),
        default=0)


def _record_complaint(complaints, message, data_source, row, source_config):
    """Create the signal's complaint, or add this geo type to an existing one.

    Only the first offending row for a signal sets the message and timestamp;
    later rows for the same signal just extend its list of geo types.
    """
    signal = row["signal"]
    if signal in complaints:
        complaints[signal].geo_types.append(row["geo_type"])
    else:
        complaints[signal] = Complaint(
            message,
            data_source,
            signal,
            [row["geo_type"]],
            row["max_time"],
            source_config["maintainers"])
2 changes: 1 addition & 1 deletion sir_complainsalot/delphi_sir_complainsalot/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def run_module():

complaints = []
for data_source in params["sources"].keys():
complaints.extend(check_source(data_source, meta, params["sources"], params.get("grace",0)))
complaints.extend(check_source(data_source, meta, params["sources"], params.get("grace", 0)))

if len(complaints) > 0:
for complaint in complaints:
Expand Down
9 changes: 9 additions & 0 deletions sir_complainsalot/params.json.template
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
"max_age": 5,
"maintainers": ["U010VE2T51N"]
},
"hospital-admissions": {
"max_age": 5,
"maintainers": ["U010VE2T51N"],
"retired-signals": ["smoothed_covid19", "smoothed_adj_covid19"]
},
"ght": {
"max_age": 5,
"maintainers": ["U010VE2T51N"]
Expand All @@ -14,6 +19,10 @@
"max_age": 2,
"maintainers": ["UUCGWMJ5P"]
},
"usa-facts": {
"max_age": 2,
"maintainers": ["UUCGWMJ5P"]
},
"safegraph": {
"max_age": 4,
"maintainers": ["U010VE2T51N"]
Expand Down

0 comments on commit 0783ebd

Please sign in to comment.