Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lint #7

Merged
merged 10 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ dependencies = [
"plotly>=5.23.0",
"boto3>=1.35.8",
"selenium>=4.24.0",
"fastjsonschema>=2.20.0",
"requests>=2.32.3",
]
scripts = { olm = "olm:main" }

Expand Down
51 changes: 40 additions & 11 deletions src/olm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import sys
import argparse
import webbrowser
import urllib
from pathlib import Path

import requests

from .report import make_report
from .lint import lint
from .outbreaks import OUTBREAKS
from .util import msg_ok, msg_fail

USAGE = """olm: Office for Linelist Management

Expand All @@ -24,7 +28,7 @@


def abort(msg):
    """Print a CLI failure message and exit with a non-zero status.

    NOTE(review): the diff artifact showed both the old ``print(msg)`` and its
    replacement; the merged version uses msg_fail only.
    """
    msg_fail("cli", msg)
    sys.exit(1)


Expand All @@ -35,10 +39,20 @@ def main():

subparsers = parser.add_subparsers(dest="command")

report_parser = subparsers.add_parser("report", help="Generate briefing report")
lint_parser = subparsers.add_parser(
"lint", help="Lint outbreak data according to schema"
)
lint_parser.add_argument("outbreak", help="Outbreak name")
lint_parser.add_argument("--data", help="Data URL")
lint_parser.add_argument("--schema", help="Data schema path or URL")
lint_parser.add_argument("--ignore", help="Ignore fields, comma-separated")

get_parser = subparsers.add_parser("get", help="Get data for outbreak")
get_parser.add_argument("outbreak", help="Outbreak name")

_ = subparsers.add_parser("list", help="List outbreaks managed by olm")

report_parser = subparsers.add_parser("report", help="Generate briefing report")
report_parser.add_argument("outbreak", help="Outbreak name")
report_parser.add_argument("--data", help="Data URL")
report_parser.add_argument(
Expand All @@ -52,24 +66,39 @@ def main():
)

args = parser.parse_args()
if args.command and args.command != "list" and args.outbreak not in OUTBREAKS:
abort(
"outbreak not known, choose from: \033[1m"
+ ", ".join(OUTBREAKS)
+ "\033[0m"
)
bold_outbreak = f"\033[1m{args.outbreak}\033[0m"

match args.command:
case "list":
for outbreak in OUTBREAKS:
print(
f"\033[1m{outbreak:12s} \033[0m{OUTBREAKS[outbreak]['description']} [{OUTBREAKS[outbreak]['id']}]"
)
case "get":
if args.outbreak not in OUTBREAKS:
abort("Outbreak not known. Choose from: " + ", ".join(OUTBREAKS))
if "url" not in OUTBREAKS[args.outbreak]:
abort(f"No data URL found for: {args.outbreak}")
abort(f"no data URL found for {bold_outbreak}")
output_file = f"{args.outbreak}.csv"
with urllib.request.urlopen(OUTBREAKS[args.outbreak]["url"]) as f:
Path(output_file).write_bytes(f.read())
print("wrote", output_file)
if (
res := requests.get(OUTBREAKS[args.outbreak]["url"])
).status_code == 200:
Path(output_file).write_text(res.text)
msg_ok("get", "wrote " + output_file)
case "lint":
ignore_keys = args.ignore.split(",") if args.ignore is not None else []
if (
lint_result := lint(args.outbreak, args.data, args.schema, ignore_keys)
).ok:
msg_ok("lint", "succeeded for " + bold_outbreak)
else:
msg_fail("lint", "failed for " + bold_outbreak)
print(lint_result)
case "report":
if args.outbreak not in OUTBREAKS:
abort(f"Outbreak not supported: {args.outbreak}")
make_report(
args.outbreak,
args.data or OUTBREAKS[args.outbreak]["url"],
Expand Down
39 changes: 39 additions & 0 deletions src/olm/lint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""
olm lint and quality control module
"""

from pathlib import Path

import pandas as pd

from .types import LintResult, RowError
from .outbreaks import read_outbreak, read_schema, get_schema_url

import fastjsonschema


def lint(
    outbreak: str,
    file: str | None = None,
    schema_path: str | Path | None = None,
    ignore_fields: list[str] | None = None,
) -> LintResult:
    """Lint outbreak data against its JSON schema.

    Parameters
    ----------
    outbreak
        Outbreak name; looked up in OUTBREAKS by read_outbreak/get_schema_url
    file
        Optional data file path or URL; defaults to the outbreak's configured URL
    schema_path
        Optional schema path or URL; defaults to the outbreak's configured schema
    ignore_fields
        Fields to exclude from validation (default: none)

    Returns
    -------
    LintResult
        ok is True when no row failed validation

    Raises
    ------
    ValueError
        If no schema_path is passed and no schema URL is configured
    """
    # None default instead of a mutable [] default (shared across calls)
    ignore = set(ignore_fields) if ignore_fields else set()
    errors: list[RowError] = []
    # do not convert dates as fastjsonschema will check date string representation
    df = read_outbreak(outbreak, file, convert_dates=False)
    if (schema_url := schema_path or get_schema_url(outbreak)) is None:
        raise ValueError("No schema_path passed or schema url found in OUTBREAKS")
    schema = read_schema(schema_url)
    validator = fastjsonschema.compile(schema)

    for row in df.to_dict("records"):
        row_id = row["ID"]  # renamed from `id` to avoid shadowing the builtin
        nrow = {k: v for k, v in row.items() if pd.notnull(v) and k not in ignore}
        try:
            validator(nrow)
        except fastjsonschema.JsonSchemaValueException as e:
            # e.path is like ['data', '<column>']; root-level errors (e.g. a
            # missing required field) have a shorter path, and the offending
            # column may have been null-filtered out of nrow — guard both.
            column = e.path[1] if len(e.path) > 1 else ""
            errors.append(RowError(row_id, column, nrow.get(column, ""), e.message))
    return LintResult(outbreak, str(schema_url), len(errors) == 0, errors)
38 changes: 36 additions & 2 deletions src/olm/outbreaks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
Outbreak configurations
"""

import json
from pathlib import Path
from typing import Any

import requests
import pandas as pd
from ..plots import (
get_counts,
Expand All @@ -14,7 +19,7 @@
plot_delay_distribution,
)
from ..types import OutbreakInfo
from ..util import read_csv
from ..util import read_csv, rename_columns, sort_values
from ..sources import source_databutton


Expand Down Expand Up @@ -66,6 +71,9 @@
"link": "https://worldhealthorg.shinyapps.io/mpx_global/",
"button_text": "Download MPXV clades",
},
rename_columns(
{"country": "Country", "iso3": "ISO3", "clade_status": "Clade status"}
),
),
(
"table/aggregate",
Expand All @@ -74,6 +82,14 @@
"country_col": "Location_Admin0",
"columns": [("Case_status", "confirmed"), ("Outcome", "death")],
},
rename_columns(
{
"Location_Admin0": "Country",
"confirmed": "Confirmed cases",
"death": "Confirmed deaths",
}
),
sort_values("Confirmed cases", ascending=False),
),
(
"data",
Expand Down Expand Up @@ -114,17 +130,34 @@
"description": "Marburg 2023 Equatorial Guinea",
"plots": outbreak_marburg,
"additional_date_columns": ["Data_up_to"],
"schema": "https://raw.githubusercontent.com/globaldothealth/outbreak-schema/main/outbreak.schema.json",
},
"mpox-2024": {
"id": "GHL2024.D11.1E71",
"description": "Mpox 2024",
"plots": outbreak_mpox_2024,
"url": "https://mpox-2024.s3.eu-central-1.amazonaws.com/latest.csv",
"schema": "https://raw.githubusercontent.com/globaldothealth/outbreak-schema/main/GHL2024.D11.1E71.schema.json",
},
}


def read_outbreak(outbreak: str, data_url: str | None = None) -> pd.DataFrame:
def get_schema_url(outbreak: str) -> str | None:
    "Return the JSON schema location configured for *outbreak*, or None if absent"
    config = OUTBREAKS[outbreak]
    return config.get("schema")


def read_schema(schema: str | Path) -> dict[str, Any]:
"Reads schema from outbreak"
if isinstance(schema, str) and schema.startswith("http"):
if (res := requests.get(schema)).status_code == 200:
return res.json()
else:
return json.loads(Path(schema).read_text())


def read_outbreak(
outbreak: str, data_url: str | None = None, convert_dates: bool = True
) -> pd.DataFrame:
assert outbreak in OUTBREAKS, f"Outbreak {outbreak} not found"
if data_url is None and OUTBREAKS[outbreak].get("url") is None:
raise ValueError(
Expand All @@ -133,6 +166,7 @@ def read_outbreak(outbreak: str, data_url: str | None = None) -> pd.DataFrame:
return read_csv(
data_url or OUTBREAKS[outbreak]["url"],
additional_date_columns=OUTBREAKS[outbreak].get("additional_date_columns", []),
convert_dates=convert_dates,
)


Expand Down
4 changes: 2 additions & 2 deletions src/olm/outbreaks/_header.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<!DOCTYPE html>
<html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
Expand All @@ -16,7 +16,7 @@
<body>
<nav>
<ul>
<li><a href="{{ data_url }}">Data</a></li>
<li><a href="{{ data_url }}">Data</a></li>
<li><a href="https://github.com/globaldothealth/outbreak-data/wiki/{{ id }}">GitHub</a></li>
</ul>
<img class="logo" src="https://global.health/wp-content/uploads/2020/07/gh-logo-full-black.png"
Expand Down
11 changes: 6 additions & 5 deletions src/olm/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import plotly.io

from .types import OutbreakInfo
from .util import read_csv, store_s3, invalidate_cache
from .util import read_csv, store_s3, invalidate_cache, msg_ok

TEMPLATES = Path(__file__).parent / "outbreaks"
HEADER = (TEMPLATES / "_header.html").read_text()
Expand Down Expand Up @@ -64,9 +64,10 @@ def make_report(
case "data":
var.update(plot[1](df, **kwargs))
case "table":
var[plot[0].removeprefix("table/")] = plot[1](df, **kwargs).to_html(
index=False
)
table_data = plot[1](df, **kwargs)
for post_processor in plot[3:]:
table_data = post_processor(table_data)
var[plot[0].removeprefix("table/")] = table_data.to_html(index=False)
case "figure":
var.update(
render_figure(
Expand All @@ -76,7 +77,7 @@ def make_report(

report_data = chevron.render(template_text, var)
Path(output_file).write_text(report_data)
print("wrote", output_file)
msg_ok("report", "wrote " + output_file)

if output_bucket:
store_s3(
Expand Down
45 changes: 44 additions & 1 deletion src/olm/types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
from typing import Callable, Any, TypedDict, NotRequired
"Types used by olm"

import json
import dataclasses
from typing import Callable, Any, TypedDict, NotRequired, NamedTuple

import plotly.graph_objects as go

PlotFunction = Callable[..., dict[str, Any] | go.Figure]
Expand All @@ -8,6 +13,44 @@
class OutbreakInfo(TypedDict):
id: str
description: str
schema: str
plots: list[tuple[str, Callable[..., Any], dict[str, Any]]]
additional_date_columns: NotRequired[list[str]]
url: NotRequired[str]


class RowError(NamedTuple):
    "A single schema-validation failure for one row of outbreak data"

    id: str  # row ID
    column: str  # offending column
    value: str  # offending value
    message: str  # validator message


@dataclasses.dataclass
class LintResult:
    "Outcome of linting one outbreak's data against its schema"

    outbreak: str
    schema: str
    ok: bool
    errors: list[RowError]

    def _error_lines(self) -> list[str]:
        "One human-readable line per row error (shared by __str__ and as_slack)"
        return [f"- ID {e.id}: {e.message}, found={e.value}" for e in self.errors]

    def as_json(self) -> str:
        "JSON representation; RowError entries are serialised as lists"
        return json.dumps(dataclasses.asdict(self), sort_keys=True, indent=2)

    def __str__(self) -> str:
        return "\n".join(self._error_lines())

    def as_html(self) -> str:
        """HTML fragment summarising the lint result.

        Previously an unimplemented stub returning None against a -> str
        annotation; now mirrors as_slack with escaped HTML output.
        """
        import html  # local import: keeps module-level dependencies unchanged

        if self.ok:
            return (
                "<p>\u2705 Lint succeeded for "
                f"<strong>{html.escape(self.outbreak)}</strong></p>"
            )
        items = "".join(
            f"<li>ID {html.escape(str(e.id))}: {html.escape(str(e.message))}, "
            f"found={html.escape(str(e.value))}</li>"
            for e in self.errors
        )
        return (
            "<p>\u274c Lint failed for "
            f"<strong>{html.escape(self.outbreak)}</strong></p><ul>{items}</ul>"
        )

    def as_slack(self) -> str:
        "Slack mrkdwn summary: header line plus one line per error"
        header = (
            "✅ Lint succeeded for " if self.ok else "❌ Lint failed for "
        ) + f"*{self.outbreak}*"
        if self.ok:
            return header
        return header + "\n" + "\n".join(self._error_lines())
33 changes: 30 additions & 3 deletions src/olm/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
import re
import logging
import datetime
from typing import Callable

import boto3

import pandas as pd


Expand Down Expand Up @@ -42,6 +42,30 @@ def non_null_unique(arr: pd.Series) -> pd.Series:
return uniq[~pd.isna(uniq)]


def msg_ok(module: str, s: str):
    "Print a green tick status line attributed to olm sub-module *module*"
    prefix = f"\033[0;32m\u2713 olm[{module}]\t\033[0m"
    print(prefix, s)


def msg_fail(module: str, s: str):
    "Print a red cross status line attributed to olm sub-module *module*"
    prefix = f"\033[0;31m\u2717 olm[{module}]\t\033[0m"
    print(prefix, s)


def rename_columns(columns: dict[str, str]) -> Callable[[pd.DataFrame], pd.DataFrame]:
    "Return a table post-processor that renames columns (old name -> new name)"

    def _apply(df: pd.DataFrame) -> pd.DataFrame:
        return df.rename(columns=columns)

    return _apply


def sort_values(
by: list[str], ascending: bool
) -> Callable[[pd.DataFrame], pd.DataFrame]:
def sort_table(df: pd.DataFrame) -> pd.DataFrame:
return df.sort_values(by=by, ascending=ascending)

return sort_table


def fix_datetimes(df: pd.DataFrame, additional_date_columns: list[str] = []):
"Convert date fields to datetime in place"
date_columns = [
Expand Down Expand Up @@ -129,7 +153,9 @@ def invalidate_cache(
raise


def read_csv(filename: str, additional_date_columns: list[str] = []) -> pd.DataFrame:
def read_csv(
filename: str, additional_date_columns: list[str] = [], convert_dates: bool = True
) -> pd.DataFrame:
"""Helper function with post-processing steps after pd.read_csv

Parameters
Expand All @@ -143,5 +169,6 @@ def read_csv(filename: str, additional_date_columns: list[str] = []) -> pd.DataF
or have 'Date ' in their column name
"""
df = pd.read_csv(filename, dtype=str, na_values=["N/K", "NK"])
fix_datetimes(df, additional_date_columns)
if convert_dates:
fix_datetimes(df, additional_date_columns)
return df
Loading