1972 replace covidcast #2056

Open
wants to merge 58 commits into base: main

Changes from 38 commits
Commits (58)
4afe0f0
in progress for replacing covidcast
aysim319 Jul 9, 2024
67a312c
moving wrapper in separate module
aysim319 Jul 25, 2024
e4f2679
working on test
aysim319 Jul 29, 2024
c941982
post process for metadata
aysim319 Jul 29, 2024
a5628ac
lint and cleanup
aysim319 Jul 30, 2024
6e22db8
fixing for test
aysim319 Jul 30, 2024
a2a149f
implementing suggested changes
aysim319 Aug 1, 2024
9269621
removing wrapper and directly converting in other places and moving t…
aysim319 Aug 5, 2024
8e67b6c
modifying test
aysim319 Aug 5, 2024
bf21d33
more suggestion
aysim319 Aug 6, 2024
d76cd40
sircomplainslot needs more filtering
aysim319 Aug 7, 2024
fd50d9d
adding credential for google symptoms
aysim319 Aug 8, 2024
76f1519
mocking api call in google symptoms
aysim319 Aug 8, 2024
157c6c6
organizing validations
aysim319 Aug 8, 2024
23384e7
extended date range and throws error when comes empty
aysim319 Aug 9, 2024
e5c3b46
delphi_utils/validator/datafetcher.py
aysim319 Aug 9, 2024
33936a4
test+refactor: tweak covidcast port tests and
dshemetov Aug 9, 2024
01a7f66
fix: don't parse datetimes on unused columns
dshemetov Aug 9, 2024
6eeef4e
lint: remove unused types
dshemetov Aug 9, 2024
1f59e06
fix: dont import covidcast in sir_complainsalot
dshemetov Aug 9, 2024
7f60275
fix: remove covidcast from indicator setup.py dependencies
dshemetov Aug 9, 2024
55150bc
fix: remove duplicate ported_signal
dshemetov Aug 10, 2024
329d340
fix: revert _parse_datetimes
dshemetov Aug 12, 2024
9d91be7
adding conditional to fail if api fails
aysim319 Aug 22, 2024
5ac98ab
change
aysim319 Sep 13, 2024
b92695a
Merge branch 'main' into 1972-replace-covidcast
aysim319 Sep 13, 2024
15ce75f
merge change that didn't make it for some reason
aysim319 Sep 13, 2024
6ee3e9e
lint
aysim319 Sep 13, 2024
f25605d
lint again
aysim319 Sep 13, 2024
2654946
fixing logic
aysim319 Sep 13, 2024
c63f095
remove covidcast from pyproject.toml
aysim319 Sep 13, 2024
79bf550
fix tests
aysim319 Sep 13, 2024
4c44d3a
need to update requirements
aysim319 Sep 13, 2024
8a308c4
fix test
aysim319 Sep 13, 2024
b4039c5
lint and fix package
aysim319 Sep 13, 2024
4916465
fix test
aysim319 Sep 13, 2024
670bf04
lint
aysim319 Sep 13, 2024
2f94d15
lint
aysim319 Sep 13, 2024
57fc591
suggested changes
aysim319 Sep 18, 2024
913c72f
fixed test
aysim319 Sep 18, 2024
a9cebed
handle check more gracefully
aysim319 Sep 18, 2024
e30aaca
export date util
aysim319 Sep 18, 2024
733c85e
wrap around try except for sircal
aysim319 Sep 19, 2024
f61462a
merge conflict
aysim319 Sep 19, 2024
06fafd0
lint
aysim319 Sep 19, 2024
d3bc895
remove former testing script
aysim319 Sep 19, 2024
68e2850
lint and fixing missing params
aysim319 Sep 19, 2024
9ff0979
lint
aysim319 Sep 19, 2024
f7fcefc
fixed test
aysim319 Sep 19, 2024
957af29
lock pandas version
aysim319 Sep 20, 2024
9585196
lint
aysim319 Sep 20, 2024
cf4f06d
lint
aysim319 Sep 20, 2024
b5929af
more fix test
aysim319 Sep 20, 2024
ee64984
lint again
aysim319 Sep 20, 2024
fa9143a
lint
aysim319 Sep 20, 2024
2de9d3b
Merge branch 'main' into 1972-replace-covidcast
aysim319 Nov 5, 2024
a1aad7a
changed based on suggestion
aysim319 Nov 11, 2024
38f25bb
made consistent with actual response
aysim319 Nov 13, 2024
11 changes: 5 additions & 6 deletions _delphi_utils_python/delphi_utils/__init__.py
@@ -5,14 +5,13 @@

from .archive import ArchiveDiffer, GitArchiveDiffer, S3ArchiveDiffer
from .export import create_export_csv
from .utils import read_params

from .slack_notifier import SlackNotifier
from .logger import get_structured_logger
from .geomap import GeoMapper
from .smooth import Smoother
from .signal import add_prefix
from .logger import get_structured_logger
from .nancodes import Nans
from .signal import add_prefix
from .slack_notifier import SlackNotifier
from .smooth import Smoother
from .utils import read_params
from .weekday import Weekday

__version__ = "0.3.25"
87 changes: 76 additions & 11 deletions _delphi_utils_python/delphi_utils/validator/datafetcher.py
@@ -3,13 +3,15 @@

import re
import threading
import warnings
from os import listdir
from os.path import isfile, join
import warnings
import requests
import pandas as pd

import numpy as np
import covidcast
import pandas as pd
import requests
from delphi_epidata import Epidata

from .errors import APIDataFetchError, ValidationFailure

FILENAME_REGEX = re.compile(
@@ -115,7 +117,22 @@ def get_geo_signal_combos(data_source, api_key):
meta_response.raise_for_status()
source_signal_mappings = {i['source']:i['db_source'] for i in
meta_response.json()}
meta = covidcast.metadata()

response = Epidata.covidcast_meta()

# pylint: disable=R1720
if response["result"] != 1:
# Something failed in the API and we did not get real metadata
raise RuntimeError(
"Error when fetching metadata from the API", response["message"]
)

# pylint: disable=I0021
else:
meta = pd.DataFrame.from_dict(response["epidata"])
# note: this will fail for signals with weekly data, but currently not supported for validation
meta = meta[meta["time_type"] == "day"]

source_meta = meta[meta['data_source'] == data_source]
# Need to convert np.records to tuples so they are hashable and can be used in sets and dicts.
geo_signal_combos = list(map(tuple,
@@ -158,18 +175,66 @@ def fetch_api_reference(data_source, start_date, end_date, geo_type, signal_type

Formatting is changed to match that of source data CSVs.
"""
with warnings.catch_warnings():
warnings.simplefilter("ignore")
api_df = covidcast.signal(
data_source, signal_type, start_date, end_date, geo_type)
if start_date > end_date:
raise ValueError(
"end_day must be on or after start_day, but "
f"start_day = '{start_date}', end_day = '{end_date}'"
)
response = Epidata.covidcast(
data_source,
signal_type,
time_type="day",
geo_type=geo_type,
time_values=Epidata.range(
start_date.strftime("%Y%m%d"), end_date.strftime("%Y%m%d")
),
geo_value="*",
)
if response["result"] != 1:
# Something failed in the API and we did not get real signal data
raise RuntimeError(
"Error when fetching signal data from the API", response["message"]
)

# pylint: disable=E1124
if response["message"] not in {"success", "no results"}:
# pylint: disable=E1123
warnings.warn(
"Problem obtaining data",
# pylint: disable=E0602
RuntimeWarning,
message=response["message"],
data_source=data_source,
signal=signal,
time_value=params["time_values"],
geo_type=geo_type,
)
response = Epidata.covidcast(
data_source,
signal_type,
time_type="day",
geo_type=geo_type,
time_values=Epidata.range(
start_date.strftime("%Y%m%d"), end_date.strftime("%Y%m%d")
),
geo_value="*",
)

api_df = None
if len(response["epidata"]) > 0:
api_df = pd.DataFrame.from_dict(response["epidata"])
# note: this will fail for signals with weekly data, but currently not supported for validation
api_df["issue"] = pd.to_datetime(api_df["issue"], format="%Y%m%d")
api_df["time_value"] = pd.to_datetime(api_df["time_value"], format="%Y%m%d")
api_df.drop("direction", axis=1, inplace=True)
api_df["data_source"] = data_source
api_df["signal"] = signal_type

error_context = f"when fetching reference data from {start_date} to {end_date} " +\
f"for data source: {data_source}, signal type: {signal_type}, geo type: {geo_type}"

if api_df is None:
raise APIDataFetchError("Error: no API data was returned " + error_context)
if not isinstance(api_df, pd.DataFrame):
raise APIDataFetchError("Error: API return value was not a dataframe " + error_context)

column_names = ["geo_id", "val",
"se", "sample_size", "time_value"]
14 changes: 7 additions & 7 deletions _delphi_utils_python/delphi_utils/validator/dynamic.py
@@ -1,14 +1,16 @@
"""Dynamic file checks."""

import re
from dataclasses import dataclass
from datetime import date, timedelta
from typing import Dict, Set
import re
import pandas as pd

import numpy as np
import covidcast
from .errors import ValidationFailure
import pandas as pd

from .datafetcher import get_geo_signal_combos, threaded_api_calls
from .utils import relative_difference_by_min, TimeWindow, lag_converter
from .errors import ValidationFailure
from .utils import TimeWindow, lag_converter, relative_difference_by_min


class DynamicValidator:
@@ -78,8 +80,6 @@ def validate(self, all_frames, report):
# Get 14 days prior to the earliest list date
outlier_lookbehind = timedelta(days=14)

# Authenticate API
covidcast.use_api_key(self.params.api_key)

# Get all expected combinations of geo_type and signal.
geo_signal_combos = get_geo_signal_combos(self.params.data_source,
8 changes: 5 additions & 3 deletions _delphi_utils_python/delphi_utils/validator/run.py
@@ -5,8 +5,10 @@
when the module is run with `python -m delphi_utils.validator`.
"""
import argparse as ap
import covidcast
from .. import read_params, get_structured_logger

from delphi_epidata import Epidata

from .. import get_structured_logger, read_params
from .validate import Validator


@@ -18,7 +20,7 @@ def run_module():
args = parser.parse_args()
params = read_params()
assert "validation" in params
covidcast.use_api_key(params["validation"]["common"]["api_credentials"])
Epidata.auth = ("epidata", params["validation"]["common"]["api_credentials"])
dry_run_param = params["validation"]["common"].get("dry_run", False)
params["validation"]["common"]["dry_run"] = args.dry_run or dry_run_param
validator = Validator(params)
2 changes: 1 addition & 1 deletion _delphi_utils_python/pyproject.toml
@@ -17,8 +17,8 @@ classifiers = [
]
dependencies = [
"boto3",
"covidcast",
"cvxpy",
"delphi-epidata",
"epiweeks",
"gitpython",
"importlib_resources>=1.3",
28 changes: 28 additions & 0 deletions _delphi_utils_python/tests/test_data/sample_epidata_metadata.json
@@ -0,0 +1,28 @@
{"data_source": ["chng", "chng", "chng",
"covid-act-now",
"covid-act-now",
"covid-act-now",
"chng"],
"signal": ["smoothed_outpatient_cli",
"smoothed_outpatient_covid",
"smoothed_outpatient_covid",
"pcr_specimen_positivity_rate",
"pcr_specimen_positivity_rate",
"pcr_specimen_total_tests",
"inactive"],
"geo_type": ["state", "state", "county",
"hrr", "msa", "msa",
"state"],
"min_time": ["20200101", "20200101", "20200101",
"20200101", "20200101", "20200101",
"20200101"],
"max_time": ["20240101", "20240101", "20240101",
"20240101", "20240101", "20240101",
"20240101"],
"last_update": [1711963480, 1711963480, 1711963480,
1711963480, 1711963480, 1711963480,
1711963480],
"time_type": ["day", "day", "day",
"day", "day", "day",
"day"]
}
9 changes: 9 additions & 0 deletions _delphi_utils_python/tests/test_data/sample_epidata_signal_a.json
@@ -0,0 +1,9 @@
{"geo_value": ["1044"],
"stderr": [null],
"value": [3],
"issue": [20200101],
"lag": [7],
"sample_size": [null],
"time_value": [20200101],
"direction": [null]
}
9 changes: 9 additions & 0 deletions _delphi_utils_python/tests/test_data/sample_epidata_signal_county.json
@@ -0,0 +1,9 @@
{"geo_value": ["0888"],
"stderr": [2],
"value": [14],
"issue": [20200101],
"lag": [1],
"sample_size": [100],
"time_value": [20200101],
"direction": [null]
}
85 changes: 31 additions & 54 deletions _delphi_utils_python/tests/validator/test_datafetcher.py
@@ -1,7 +1,9 @@
"""Tests for datafetcher.py."""

from datetime import date
from datetime import date, datetime
import mock
import json
from pathlib import Path
import numpy as np
import pandas as pd
import pytest
@@ -14,6 +16,7 @@
from delphi_utils.validator.errors import ValidationFailure


TEST_DIR = Path(__file__).parent.parent

class TestDataFetcher:
"""Tests for various data fetching utilities."""
@@ -45,6 +48,27 @@ def raise_for_status(self):
{'source': 'covid-act-now', 'db_source': 'covid-act-now'}], 200)
elif "params" in kwargs and kwargs["params"] == {'signal': 'chng:inactive'}:
return MockResponse([{"signals": [{"active": False}]}], 200)
elif args[0] == 'https://api.delphi.cmu.edu/epidata/covidcast_meta/' and \
'delphi_epidata' in kwargs["headers"]["user-agent"]:
with open(f"{TEST_DIR}/test_data/sample_epidata_metadata.json") as f:
epidata = json.load(f)
response = {"epidata": epidata, "result": 1, "message": "success"}
return MockResponse(response, 200)
elif args[0] == 'https://api.delphi.cmu.edu/epidata/covidcast/' and \
'delphi_epidata' in kwargs["headers"]["user-agent"]:
signal_type = args[1].get("signals")
geo_type = args[1].get("geo_type")
if signal_type == "a":
with open(f"{TEST_DIR}/test_data/sample_epidata_signal_a.json") as f:
epidata = json.load(f)
response = {"epidata": epidata, "result": 1, "message": "success"}
return MockResponse(response, 200)
Comment on lines +63 to +66
Contributor
I love that you broke the sample data out into their own .json files!

However, for accuracy's sake, the format doesn't match actual usage now: you're handing back "HTTP responses" whose "epidata" payload is transposed/rotated from what it is in practice. The API server would be returning a list of dicts, but this is giving a dict of lists.

AFAICT, the differences come from the old version of these tests, which mocked calls to the covidcast library's methods instead of the underlying HTTP request as is done now -- the old library seems to get these transpositions from its uses of pd.DataFrame.from_dict().

TL;DR: If you're going to mock HTTP requests (which I think is the right thing to do), make them return something that looks like the real HTTP responses. Otherwise, mock a different layer in the call stack.
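
For illustration, a minimal sketch of the two payload shapes being discussed, with field names and values borrowed from the sample fixture files in this PR (the real covidcast payload carries additional fields):

```python
import pandas as pd

# Shape the API server actually returns: a list of row dicts.
rows = [
    {"geo_value": "1044", "value": 3, "time_value": 20200101},
    {"geo_value": "0888", "value": 14, "time_value": 20200101},
]

# Transposed shape used by the current fixtures: a dict of column lists.
columns = {
    "geo_value": ["1044", "0888"],
    "value": [3, 14],
    "time_value": [20200101, 20200101],
}

# Both flatten to the same frame once they reach pandas, which is why the
# tests still pass, but only the first mirrors a real HTTP-level response.
assert pd.DataFrame(rows).equals(pd.DataFrame(columns))
```

Either shape survives the conversion to a DataFrame; the distinction only matters for how faithful the requests.get mock is to the real API.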

Contributor Author
I compared the output that epidata.covidcast returned, and this is how it comes back: sample_epidata_covidcast_result.json
sample code:

```python
Epidata.debug = True
Epidata.auth = ("epidata", API_KEY)

meta = pd.DataFrame(Epidata.covidcast_meta()["epidata"])

source = "doctor-visits"
nchs_meta = meta[meta["data_source"] == source]

signal = "smoothed_cli"

response6 = Epidata.covidcast(source, signal, time_type="day",
                              geo_type="state", time_values=Epidata.range("20241105", "20241105"),
                              geo_value="*", as_of=None,
                              )
df = pd.DataFrame.from_dict(response6["epidata"])
```

if geo_type == "county":
with open(f"{TEST_DIR}/test_data/sample_epidata_signal_county.json") as f:
epidata = json.load(f)
response = {"epidata": epidata, "result": 1, "message": "success"}
return MockResponse(response, 200)
return MockResponse({"epidata": {}, "result": 1, "message": "success"}, 200)
else:
return MockResponse([{"signals": [{"active": True}]}], 200)

@@ -57,27 +81,9 @@ def test_bad_api_key(self, **kwargs):
get_geo_signal_combos("chng", api_key="")

@mock.patch('requests.get', side_effect=mocked_requests_get)
@mock.patch("covidcast.metadata")
def test_get_geo_signal_combos(self, mock_metadata, mock_get):
def test_get_geo_signal_combos(self, mock_get):

"""Test that the geo signal combos are correctly pulled from the covidcast metadata."""
# Need to use actual data_source and signal names since we reference the API
# We let the chng signal "inactive" be an inactive signal
mock_metadata.return_value = pd.DataFrame({"data_source": ["chng", "chng", "chng",
"covid-act-now",
"covid-act-now",
"covid-act-now",
"chng"],
"signal": ["smoothed_outpatient_cli",
"smoothed_outpatient_covid",
"smoothed_outpatient_covid",
"pcr_specimen_positivity_rate",
"pcr_specimen_positivity_rate",
"pcr_specimen_total_tests",
"inactive"],
"geo_type": ["state", "state", "county",
"hrr", "msa", "msa",
"state"]
})
assert set(get_geo_signal_combos("chng", api_key="")) == set(
[("state", "smoothed_outpatient_cli"),
("state", "smoothed_outpatient_covid"),
@@ -87,49 +93,20 @@ def test_get_geo_signal_combos(self, mock_metadata, mock_get):
("msa", "pcr_specimen_positivity_rate"),
("msa", "pcr_specimen_total_tests")])

@mock.patch("covidcast.signal")
def test_threaded_api_calls(self, mock_signal):
@mock.patch('requests.get', side_effect=mocked_requests_get)
def test_threaded_api_calls(self, mock_get):
"""Test that calls to the covidcast API are made."""

signal_data_1 = pd.DataFrame({"geo_value": ["1044"],
"stderr": [None],
"value": [3],
"issue": [10],
"lag": [7],
"sample_size": [None],
"time_value": [10]
})
signal_data_2 = pd.DataFrame({"geo_value": ["0888"],
"stderr": [2],
"value": [14],
"issue": [10],
"lag": [1],
"sample_size": [100],
"time_value": [8]
})

def mock_signal_return_fn(unused_data_source, signal_type, unused_start_date,
unused_end_date, geo_type):
"""Function to return data when covidcast.signal() is called."""
if signal_type == "a":
return signal_data_1
if geo_type == "county":
return signal_data_2
return None

mock_signal.side_effect = mock_signal_return_fn

processed_signal_data_1 = pd.DataFrame({"geo_id": ["1044"],
"val": [3],
"se": [np.nan],
"sample_size": [np.nan],
"time_value": [10]
"time_value": [datetime.strptime("20200101", "%Y%m%d")],
})
processed_signal_data_2 = pd.DataFrame({"geo_id": ["0888"],
"val": [14],
"se": [2],
"sample_size": [100],
"time_value": [8]
"time_value": [datetime.strptime("20200101", "%Y%m%d")],
})
expected = {
("county", "a"): processed_signal_data_1,
1 change: 0 additions & 1 deletion changehc/setup.py
@@ -3,7 +3,6 @@

required = [
"boto3",
"covidcast",
"darker[isort]~=2.1.1",
"delphi-utils",
"mock",
3 changes: 2 additions & 1 deletion changehc/tests/test_update_sensor.py
@@ -91,7 +91,8 @@ def test_geo_reindex(self):
"timestamp": [pd.Timestamp(f'03-{i}-2020') for i in range(1, 14)]})
if geo == "county": # test for rogue \N
row_contain_N = {"num": 700, "fips": r"\N", "den": 2000, "timestamp": pd.Timestamp("03-15-2020")}
test_data = test_data.append(row_contain_N, ignore_index=True)
test_data = pd.concat([test_data, pd.DataFrame([row_contain_N])], ignore_index=True)

data_frame = su_inst.geo_reindex(test_data)
assert data_frame.shape[0] == multiple*len(su_inst.fit_dates)
assert (data_frame.sum(numeric_only=True) == (4200,19000)).all()