From c3b4bfea4394d2e22a49ad89d8f4a7132ef21e83 Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Sun, 8 Sep 2024 13:46:54 +0100 Subject: [PATCH 01/10] lint --- src/olm/lint.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 src/olm/lint.py diff --git a/src/olm/lint.py b/src/olm/lint.py new file mode 100644 index 0000000..b0b5b9a --- /dev/null +++ b/src/olm/lint.py @@ -0,0 +1,45 @@ +""" +olm lint and quality control module +""" + +import json +import dataclasses +from typing import NamedTuple + + +from .outbreaks import read_outbreak + + +class RowError(NamedTuple): + id: str + column: str + value: str + message: str + + +@dataclasses.dataclass +class LintResult: + outbreak: str + schema: str + filehash: str + ok: bool + errors: list[RowError] + + def as_json(self) -> str: + return json.dumps(dataclasses.asdict(self), sort_keys=True, indent=2) + + def as_html(self) -> str: + pass + + def as_slack(self) -> str: + pass + + +def lint(outbreak: str, file: str | None = None) -> LintResult: + errors = [] + schema = None + df = read_outbreak(outbreak, file) + for row in df.to_dict("records"): + # lint each row + pass + return LintResult(outbreak, schema, "", len(errors) == 0, errors) From 12abcc7f68822dc24a7e0f1e901d8ba9cd48eaa0 Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Sun, 8 Sep 2024 20:35:28 +0100 Subject: [PATCH 02/10] pyproject.toml: add fastjsonschema --- pyproject.toml | 1 + uv.lock | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e685aea..8982813 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ dependencies = [ "plotly>=5.23.0", "boto3>=1.35.8", "selenium>=4.24.0", + "fastjsonschema>=2.20.0", ] scripts = { olm = "olm:main" } diff --git a/uv.lock b/uv.lock index daad0e5..566be2e 100644 --- a/uv.lock +++ b/uv.lock @@ -1,7 +1,6 @@ version = 1 requires-python = ">=3.11" resolution-markers = [ - "python_full_version < '0'", "python_full_version < '3.12'", "python_full_version >= '3.12'", ] @@ -140,6 +139,15 @@ toml = [ { name = "tomli", marker = "python_full_version == '3.11'" }, ] +[[package]] +name = "fastjsonschema" +version = "2.20.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/03/3f/3ad5e7be13b4b8b55f4477141885ab2364f65d5f6ad5f7a9daffd634d066/fastjsonschema-2.20.0.tar.gz", hash = "sha256:3d48fc5300ee96f5d116f10fe6f28d938e6008f59a6a025c2649475b87f76a23", size = 373056 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/ca/086311cdfc017ec964b2436fe0c98c1f4efcb7e4c328956a22456e497655/fastjsonschema-2.20.0-py3-none-any.whl", hash = "sha256:5875f0b0fa7a0043a91e93a9b8f793bcbbba9691e7fd83dca95c28ba26d21f0a", size = 23543 }, +] + [[package]] name = "h11" version = "0.14.0" @@ -228,6 +236,7 @@ source = { editable = "." } dependencies = [ { name = "boto3" }, { name = "chevron" }, + { name = "fastjsonschema" }, { name = "pandas" }, { name = "plotly" }, { name = "python-dateutil" }, @@ -244,6 +253,7 @@ dev = [ requires-dist = [ { name = "boto3", specifier = ">=1.35.8" }, { name = "chevron", specifier = ">=0.14.0" }, + { name = "fastjsonschema", specifier = ">=2.20.0" }, { name = "pandas", specifier = ">=2.2.2" }, { name = "plotly", specifier = ">=5.23.0" }, { name = "python-dateutil", specifier = ">=2.9.0.post0" }, From 76b578da74caa167a8cefc0865982b44a738693f Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Sun, 8 Sep 2024 22:00:48 +0100 Subject: [PATCH 03/10] lint update --- pyproject.toml | 1 + src/olm/__init__.py | 30 ++++++++++++++----- src/olm/lint.py | 48 ++++++++++-------------------- src/olm/outbreaks/__init__.py | 21 ++++++++++++- src/olm/types.py | 33 ++++++++++++++++++++- src/olm/util.py | 7 +++-- uv.lock | 56 +++++++++++++++++++++++++++++++++++ 7 files changed, 152 insertions(+), 44 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8982813..b97bbcb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "boto3>=1.35.8", "selenium>=4.24.0", "fastjsonschema>=2.20.0", + "requests>=2.32.3", ] scripts = { olm = "olm:main" } diff --git a/src/olm/__init__.py b/src/olm/__init__.py index 151e9c7..53ac841 100644 --- a/src/olm/__init__.py +++ b/src/olm/__init__.py @@ -1,9 +1,12 @@ import sys import argparse import webbrowser -import urllib from pathlib import Path + +import requests + from .report import make_report +from .lint import lint from .outbreaks import OUTBREAKS USAGE = """olm: Office for Linelist Management @@ -35,10 +38,18 @@ def main(): subparsers = parser.add_subparsers(dest="command") - report_parser = subparsers.add_parser("report", help="Generate briefing report") + lint_parser = subparsers.add_parser( + "lint", help="Lint outbreak data according to schema" + ) + lint_parser.add_argument("outbreak", help="Outbreak name") + lint_parser.add_argument("--data", help="Data URL") + get_parser = subparsers.add_parser("get", help="Get data for outbreak") get_parser.add_argument("outbreak", help="Outbreak name") + _ = subparsers.add_parser("list", help="List outbreaks managed by olm") + + report_parser = subparsers.add_parser("report", help="Generate briefing report") report_parser.add_argument("outbreak", help="Outbreak name") report_parser.add_argument("--data", help="Data URL") report_parser.add_argument( @@ -52,6 +63,9 @@ def main(): ) args = parser.parse_args() + if args.outbreak is not None and args.outbreak not in OUTBREAKS: + abort("Outbreak not known. Choose from: " + ", ".join(OUTBREAKS)) + match args.command: case "list": for outbreak in OUTBREAKS: @@ -59,17 +73,17 @@ def main(): f"\033[1m{outbreak:12s} \033[0m{OUTBREAKS[outbreak]['description']} [{OUTBREAKS[outbreak]['id']}]" ) case "get": - if args.outbreak not in OUTBREAKS: - abort("Outbreak not known. Choose from: " + ", ".join(OUTBREAKS)) if "url" not in OUTBREAKS[args.outbreak]: abort(f"No data URL found for: {args.outbreak}") output_file = f"{args.outbreak}.csv" - with urllib.request.urlopen(OUTBREAKS[args.outbreak]["url"]) as f: - Path(output_file).write_bytes(f.read()) + if ( + res := requests.get(OUTBREAKS[args.outbreak]["url"]) + ).status_code == 200: + Path(output_file).write_text(res.text) print("wrote", output_file) + case "lint": + lint(args.outbreak, args.data) case "report": - if args.outbreak not in OUTBREAKS: - abort(f"Outbreak not supported: {args.outbreak}") make_report( args.outbreak, args.data or OUTBREAKS[args.outbreak]["url"], diff --git a/src/olm/lint.py b/src/olm/lint.py index b0b5b9a..7b36da6 100644 --- a/src/olm/lint.py +++ b/src/olm/lint.py @@ -2,44 +2,28 @@ olm lint and quality control module """ -import json -import dataclasses -from typing import NamedTuple +from pathlib import Path +import pandas as pd -from .outbreaks import read_outbreak +from .types import LintResult +from .outbreaks import read_outbreak, read_schema - -class RowError(NamedTuple): - id: str - column: str - value: str - message: str - - -@dataclasses.dataclass -class LintResult: - outbreak: str - schema: str - filehash: str - ok: bool - errors: list[RowError] - - def as_json(self) -> str: - return json.dumps(dataclasses.asdict(self), sort_keys=True, indent=2) - - def as_html(self) -> str: - pass - - def as_slack(self) -> str: - pass +import fastjsonschema def lint(outbreak: str, file: str | None = None) -> LintResult: errors = [] - schema = None - df = read_outbreak(outbreak, file) + # do not convert dates as fastjsonschema will check date string representation + df = read_outbreak(outbreak, file, convert_dates=False) + schema = read_schema(Path("GHL2024.D11.1E71.schema.json")) + validator = fastjsonschema.compile(schema) + for row in df.to_dict("records"): - # lint each row - pass + id = row["ID"] + nrow = {k: v for k, v in row.items() if pd.notnull(v)} + try: + validator(nrow) + except fastjsonschema.JsonSchemaValueException as e: + print(f"ID {id}: {e}, found: {nrow[e.path[1]]}") return LintResult(outbreak, schema, "", len(errors) == 0, errors) diff --git a/src/olm/outbreaks/__init__.py b/src/olm/outbreaks/__init__.py index c93a021..87c16c8 100644 --- a/src/olm/outbreaks/__init__.py +++ b/src/olm/outbreaks/__init__.py @@ -2,6 +2,11 @@ Outbreak configurations """ +import json +from pathlib import Path +from typing import Any + +import requests import pandas as pd from ..plots import ( get_counts, @@ -120,11 +125,24 @@ "description": "Mpox 2024", "plots": outbreak_mpox_2024, "url": "https://mpox-2024.s3.eu-central-1.amazonaws.com/latest.csv", + "schema": "GHL2024.D11.1E71.schema.json", }, } -def read_outbreak(outbreak: str, data_url: str | None = None) -> pd.DataFrame: +def read_schema(outbreak_or_schema: str | Path) -> dict[str, Any]: + "Reads schema from outbreak" + if isinstance(outbreak_or_schema, Path): + return json.loads(outbreak_or_schema.read_text()) + schema = OUTBREAKS[outbreak_or_schema]["schema"] + if schema.startswith("http"): + if (res := requests.get(schema)).status_code == 200: + return res.json() + + +def read_outbreak( + outbreak: str, data_url: str | None = None, convert_dates: bool = True +) -> pd.DataFrame: assert outbreak in OUTBREAKS, f"Outbreak {outbreak} not found" if data_url is None and OUTBREAKS[outbreak].get("url") is None: raise ValueError( @@ -133,6 +151,7 @@ def read_outbreak(outbreak: str, data_url: str | None = None) -> pd.DataFrame: return read_csv( data_url or OUTBREAKS[outbreak]["url"], additional_date_columns=OUTBREAKS[outbreak].get("additional_date_columns", []), + convert_dates=convert_dates, ) diff --git a/src/olm/types.py b/src/olm/types.py index 419319e..eccb07e 100644 --- a/src/olm/types.py +++ b/src/olm/types.py @@ -1,4 +1,9 @@ -from typing import Callable, Any, TypedDict, NotRequired +"Types used by olm" + +import json +import dataclasses +from typing import Callable, Any, TypedDict, NotRequired, NamedTuple + import plotly.graph_objects as go PlotFunction = Callable[..., dict[str, Any] | go.Figure] @@ -8,6 +13,32 @@ class OutbreakInfo(TypedDict): id: str description: str + schema: str plots: list[tuple[str, Callable[..., Any], dict[str, Any]]] additional_date_columns: NotRequired[list[str]] url: NotRequired[str] + + +class RowError(NamedTuple): + id: str + column: str + value: str + message: str + + +@dataclasses.dataclass +class LintResult: + outbreak: str + schema: str + filehash: str + ok: bool + errors: list[RowError] + + def as_json(self) -> str: + return json.dumps(dataclasses.asdict(self), sort_keys=True, indent=2) + + def as_html(self) -> str: + pass + + def as_slack(self) -> str: + pass diff --git a/src/olm/util.py b/src/olm/util.py index 9d7b5d6..2c07074 100644 --- a/src/olm/util.py +++ b/src/olm/util.py @@ -129,7 +129,9 @@ def invalidate_cache( raise -def read_csv(filename: str, additional_date_columns: list[str] = []) -> pd.DataFrame: +def read_csv( + filename: str, additional_date_columns: list[str] = [], convert_dates: bool = True +) -> pd.DataFrame: """Helper function with post-processing steps after pd.read_csv Parameters @@ -143,5 +145,6 @@ def read_csv(filename: str, additional_date_columns: list[str] = []) -> pd.DataF or have 'Date ' in their column name """ df = pd.read_csv(filename, dtype=str, na_values=["N/K", "NK"]) - fix_datetimes(df, additional_date_columns) + if convert_dates: + fix_datetimes(df, additional_date_columns) return df diff --git a/uv.lock b/uv.lock index 566be2e..de02b20 100644 --- a/uv.lock +++ b/uv.lock @@ -68,6 +68,45 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009 }, ] +[[package]] +name = "charset-normalizer" +version = "3.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/63/09/c1bc53dab74b1816a00d8d030de5bf98f724c52c1635e07681d312f20be8/charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5", size = 104809 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/77/02839016f6fbbf808e8b38601df6e0e66c17bbab76dff4613f7511413597/charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db", size = 191647 }, + { url = "https://files.pythonhosted.org/packages/3e/33/21a875a61057165e92227466e54ee076b73af1e21fe1b31f1e292251aa1e/charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96", size = 121434 }, + { url = "https://files.pythonhosted.org/packages/dd/51/68b61b90b24ca35495956b718f35a9756ef7d3dd4b3c1508056fa98d1a1b/charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e", size = 118979 }, + { url = "https://files.pythonhosted.org/packages/e4/a6/7ee57823d46331ddc37dd00749c95b0edec2c79b15fc0d6e6efb532e89ac/charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f", size = 136582 }, + { url = "https://files.pythonhosted.org/packages/74/f1/0d9fe69ac441467b737ba7f48c68241487df2f4522dd7246d9426e7c690e/charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574", size = 146645 }, + { url = "https://files.pythonhosted.org/packages/05/31/e1f51c76db7be1d4aef220d29fbfa5dbb4a99165d9833dcbf166753b6dc0/charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4", size = 139398 }, + { url = "https://files.pythonhosted.org/packages/40/26/f35951c45070edc957ba40a5b1db3cf60a9dbb1b350c2d5bef03e01e61de/charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8", size = 140273 }, + { url = "https://files.pythonhosted.org/packages/07/07/7e554f2bbce3295e191f7e653ff15d55309a9ca40d0362fcdab36f01063c/charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc", size = 142577 }, + { url = "https://files.pythonhosted.org/packages/d8/b5/eb705c313100defa57da79277d9207dc8d8e45931035862fa64b625bfead/charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae", size = 137747 }, + { url = "https://files.pythonhosted.org/packages/19/28/573147271fd041d351b438a5665be8223f1dd92f273713cb882ddafe214c/charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887", size = 143375 }, + { url = "https://files.pythonhosted.org/packages/cf/7c/f3b682fa053cc21373c9a839e6beba7705857075686a05c72e0f8c4980ca/charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae", size = 148474 }, + { url = "https://files.pythonhosted.org/packages/1e/49/7ab74d4ac537ece3bc3334ee08645e231f39f7d6df6347b29a74b0537103/charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce", size = 140232 }, + { url = "https://files.pythonhosted.org/packages/2d/dc/9dacba68c9ac0ae781d40e1a0c0058e26302ea0660e574ddf6797a0347f7/charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f", size = 140859 }, + { url = "https://files.pythonhosted.org/packages/6c/c2/4a583f800c0708dd22096298e49f887b49d9746d0e78bfc1d7e29816614c/charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab", size = 92509 }, + { url = "https://files.pythonhosted.org/packages/57/ec/80c8d48ac8b1741d5b963797b7c0c869335619e13d4744ca2f67fc11c6fc/charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77", size = 99870 }, + { url = "https://files.pythonhosted.org/packages/d1/b2/fcedc8255ec42afee97f9e6f0145c734bbe104aac28300214593eb326f1d/charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8", size = 192892 }, + { url = "https://files.pythonhosted.org/packages/2e/7d/2259318c202f3d17f3fe6438149b3b9e706d1070fe3fcbb28049730bb25c/charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b", size = 122213 }, + { url = "https://files.pythonhosted.org/packages/3a/52/9f9d17c3b54dc238de384c4cb5a2ef0e27985b42a0e5cc8e8a31d918d48d/charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6", size = 119404 }, + { url = "https://files.pythonhosted.org/packages/99/b0/9c365f6d79a9f0f3c379ddb40a256a67aa69c59609608fe7feb6235896e1/charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a", size = 137275 }, + { url = "https://files.pythonhosted.org/packages/91/33/749df346e93d7a30cdcb90cbfdd41a06026317bfbfb62cd68307c1a3c543/charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389", size = 147518 }, + { url = "https://files.pythonhosted.org/packages/72/1a/641d5c9f59e6af4c7b53da463d07600a695b9824e20849cb6eea8a627761/charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa", size = 140182 }, + { url = "https://files.pythonhosted.org/packages/ee/fb/14d30eb4956408ee3ae09ad34299131fb383c47df355ddb428a7331cfa1e/charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b", size = 141869 }, + { url = "https://files.pythonhosted.org/packages/df/3e/a06b18788ca2eb6695c9b22325b6fde7dde0f1d1838b1792a0076f58fe9d/charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed", size = 144042 }, + { url = "https://files.pythonhosted.org/packages/45/59/3d27019d3b447a88fe7e7d004a1e04be220227760264cc41b405e863891b/charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26", size = 138275 }, + { url = "https://files.pythonhosted.org/packages/7b/ef/5eb105530b4da8ae37d506ccfa25057961b7b63d581def6f99165ea89c7e/charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d", size = 144819 }, + { url = "https://files.pythonhosted.org/packages/a2/51/e5023f937d7f307c948ed3e5c29c4b7a3e42ed2ee0b8cdf8f3a706089bf0/charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068", size = 149415 }, + { url = "https://files.pythonhosted.org/packages/24/9d/2e3ef673dfd5be0154b20363c5cdcc5606f35666544381bee15af3778239/charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143", size = 141212 }, + { url = "https://files.pythonhosted.org/packages/5b/ae/ce2c12fcac59cb3860b2e2d76dc405253a4475436b1861d95fe75bdea520/charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4", size = 142167 }, + { url = "https://files.pythonhosted.org/packages/ed/3a/a448bf035dce5da359daf9ae8a16b8a39623cc395a2ffb1620aa1bce62b0/charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7", size = 93041 }, + { url = "https://files.pythonhosted.org/packages/b6/7c/8debebb4f90174074b827c63242c23851bdf00a532489fba57fef3416e40/charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001", size = 100397 }, + { url = "https://files.pythonhosted.org/packages/28/76/e6222113b83e3622caa4bb41032d0b1bf785250607392e1b778aca0b8a7d/charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc", size = 48543 }, +] + [[package]] name = "chevron" version = "0.14.0" @@ -240,6 +279,7 @@ dependencies = [ { name = "pandas" }, { name = "plotly" }, { name = "python-dateutil" }, + { name = "requests" }, { name = "selenium" }, ] @@ -257,6 +297,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.2.2" }, { name = "plotly", specifier = ">=5.23.0" }, { name = "python-dateutil", specifier = ">=2.9.0.post0" }, + { name = "requests", specifier = ">=2.32.3" }, { name = "selenium", specifier = ">=4.24.0" }, ] @@ -404,6 +445,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9c/3d/a121f284241f08268b21359bd425f7d4825cffc5ac5cd0e1b3d82ffd2b10/pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319", size = 505474 }, ] +[[package]] +name = "requests" +version = "2.32.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, +] + [[package]] name = "s3transfer" version = "0.10.2" From 4e9ac9117ac9f0a5f17405cf8b6c6f71e3c87429 Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Sun, 8 Sep 2024 22:41:14 +0100 Subject: [PATCH 04/10] lint update and pretty print --- src/olm/__init__.py | 2 +- src/olm/lint.py | 19 ++++++++++++------- src/olm/outbreaks/__init__.py | 13 ++++++++----- src/olm/types.py | 22 ++++++++++++++++++++-- src/olm/util.py | 1 - 5 files changed, 41 insertions(+), 16 deletions(-) diff --git a/src/olm/__init__.py b/src/olm/__init__.py index 53ac841..a34bcba 100644 --- a/src/olm/__init__.py +++ b/src/olm/__init__.py @@ -82,7 +82,7 @@ def main(): Path(output_file).write_text(res.text) print("wrote", output_file) case "lint": - lint(args.outbreak, args.data) + print(lint(args.outbreak, args.data)) case "report": make_report( args.outbreak, diff --git a/src/olm/lint.py b/src/olm/lint.py index 7b36da6..820f0a6 100644 --- a/src/olm/lint.py +++ b/src/olm/lint.py @@ -6,17 +6,21 @@ import pandas as pd -from .types import LintResult -from .outbreaks import read_outbreak, read_schema +from .types import LintResult, RowError +from .outbreaks import read_outbreak, read_schema, get_schema_url import fastjsonschema -def lint(outbreak: str, file: str | None = None) -> LintResult: - errors = [] +def lint( + outbreak: str, file: str | None = None, schema_path: str | Path | None = None +) -> LintResult: + errors: list[RowError] = [] # do not convert dates as fastjsonschema will check date string representation df = read_outbreak(outbreak, file, convert_dates=False) - schema = read_schema(Path("GHL2024.D11.1E71.schema.json")) + if (schema_url := schema_path or get_schema_url(outbreak)) is None: + raise ValueError("No schema_path passed or schema url found in OUTBREAKS") + schema = read_schema(schema_url) validator = fastjsonschema.compile(schema) for row in df.to_dict("records"): @@ -25,5 +29,6 @@ def lint(outbreak: str, file: str | None = None) -> LintResult: try: validator(nrow) except fastjsonschema.JsonSchemaValueException as e: - print(f"ID {id}: {e}, found: {nrow[e.path[1]]}") - return LintResult(outbreak, schema, "", len(errors) == 0, errors) + column = e.path[1] + errors.append(RowError(id, column, nrow[column], e.message)) + return LintResult(outbreak, str(schema_url), len(errors) == 0, errors) diff --git a/src/olm/outbreaks/__init__.py b/src/olm/outbreaks/__init__.py index 87c16c8..a1f2fbd 100644 --- a/src/olm/outbreaks/__init__.py +++ b/src/olm/outbreaks/__init__.py @@ -130,14 +130,17 @@ } -def read_schema(outbreak_or_schema: str | Path) -> dict[str, Any]: +def get_schema_url(outbreak: str) -> str | None: + return OUTBREAKS[outbreak].get("schema") + + +def read_schema(schema: str | Path) -> dict[str, Any]: "Reads schema from outbreak" - if isinstance(outbreak_or_schema, Path): - return json.loads(outbreak_or_schema.read_text()) - schema = OUTBREAKS[outbreak_or_schema]["schema"] - if schema.startswith("http"): + if isinstance(schema, str) and schema.startswith("http"): if (res := requests.get(schema)).status_code == 200: return res.json() + else: + return json.loads(Path(schema).read_text()) def read_outbreak( diff --git a/src/olm/types.py b/src/olm/types.py index eccb07e..f3b9062 100644 --- a/src/olm/types.py +++ b/src/olm/types.py @@ -30,15 +30,33 @@ class RowError(NamedTuple): class LintResult: outbreak: str schema: str - filehash: str ok: bool errors: list[RowError] def as_json(self) -> str: return json.dumps(dataclasses.asdict(self), sort_keys=True, indent=2) + def __str__(self) -> str: + header = ( + "✅ Lint succeeded for " if self.ok else "❌ Lint failed for " + ) + f"\033[1m{self.outbreak}\033[0m\n" + if self.ok: + return header + errors = "\n".join( + f"- ID {e.id}: {e.message}, found={e.value}" for e in self.errors + ) + return header + errors + def as_html(self) -> str: pass def as_slack(self) -> str: - pass + header = ( + "✅ Lint succeeded for " if self.ok else "❌ Lint failed for " + ) + f"*{self.outbreak}*" + if self.ok: + return header + errors = "\n".join( + f"- ID {e.id}: {e.message}, found={e.value}" for e in self.errors + ) + return header + "\n" + errors diff --git a/src/olm/util.py b/src/olm/util.py index 2c07074..9799eec 100644 --- a/src/olm/util.py +++ b/src/olm/util.py @@ -7,7 +7,6 @@ import datetime import boto3 - import pandas as pd From f1ae27f69d87e4940f89941783b10191ff2101da Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Sun, 8 Sep 2024 22:52:49 +0100 Subject: [PATCH 05/10] lint update --- src/olm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/olm/__init__.py b/src/olm/__init__.py index a34bcba..83d1f3d 100644 --- a/src/olm/__init__.py +++ b/src/olm/__init__.py @@ -63,7 +63,7 @@ def main(): ) args = parser.parse_args() - if args.outbreak is not None and args.outbreak not in OUTBREAKS: + if args.command and args.command != "list" and args.outbreak not in OUTBREAKS: abort("Outbreak not known. Choose from: " + ", ".join(OUTBREAKS)) match args.command: From b2dd79cbf0af306ece05c6f2e260cff84d485bf3 Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Sun, 8 Sep 2024 23:14:03 +0100 Subject: [PATCH 06/10] cli: coloured output for success and failure --- src/olm/__init__.py | 20 +++++++++++++++----- src/olm/report.py | 4 ++-- src/olm/types.py | 8 +------- src/olm/util.py | 8 ++++++++ 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/olm/__init__.py b/src/olm/__init__.py index 83d1f3d..a102209 100644 --- a/src/olm/__init__.py +++ b/src/olm/__init__.py @@ -8,6 +8,7 @@ from .report import make_report from .lint import lint from .outbreaks import OUTBREAKS +from .util import msg_ok, msg_fail USAGE = """olm: Office for Linelist Management @@ -27,7 +28,7 @@ def abort(msg): - print(msg) + msg_fail("cli", msg) sys.exit(1) @@ -64,7 +65,11 @@ def main(): args = parser.parse_args() if args.command and args.command != "list" and args.outbreak not in OUTBREAKS: - abort("Outbreak not known. Choose from: " + ", ".join(OUTBREAKS)) + abort( + "outbreak not known, choose from: \033[1m" + + ", ".join(OUTBREAKS) + + "\033[0m" + ) match args.command: case "list": @@ -74,15 +79,20 @@ def main(): ) case "get": if "url" not in OUTBREAKS[args.outbreak]: - abort(f"No data URL found for: {args.outbreak}") + abort(f"no data URL found for \033[1m{args.outbreak}\033[0m") output_file = f"{args.outbreak}.csv" if ( res := requests.get(OUTBREAKS[args.outbreak]["url"]) ).status_code == 200: Path(output_file).write_text(res.text) - print("wrote", output_file) + msg_ok("get", "wrote " + output_file) case "lint": - print(lint(args.outbreak, args.data)) + lint_result = lint(args.outbreak, args.data) + if lint_result.ok: + msg_ok("lint", "succeeded for " + args.outbreak) + else: + msg_fail("lint", "failed for " + args.outbreak) + print(lint_result) case "report": make_report( args.outbreak, diff --git a/src/olm/report.py b/src/olm/report.py index de4bf81..964f390 100644 --- a/src/olm/report.py +++ b/src/olm/report.py @@ -9,7 +9,7 @@ import plotly.io from .types import OutbreakInfo -from .util import read_csv, store_s3, invalidate_cache +from .util import read_csv, store_s3, invalidate_cache, msg_ok TEMPLATES = Path(__file__).parent / "outbreaks" HEADER = (TEMPLATES / "_header.html").read_text() @@ -76,7 +76,7 @@ def make_report( report_data = chevron.render(template_text, var) Path(output_file).write_text(report_data) - print("wrote", output_file) + msg_ok("report", "wrote " + output_file) if output_bucket: store_s3( diff --git a/src/olm/types.py b/src/olm/types.py index f3b9062..a0eaa7d 100644 --- a/src/olm/types.py +++ b/src/olm/types.py @@ -37,15 +37,9 @@ def as_json(self) -> str: return json.dumps(dataclasses.asdict(self), sort_keys=True, indent=2) def __str__(self) -> str: - header = ( - "✅ Lint succeeded for " if self.ok else "❌ Lint failed for " - ) + f"\033[1m{self.outbreak}\033[0m\n" - if self.ok: - return header - errors = "\n".join( + return "\n".join( f"- ID {e.id}: {e.message}, found={e.value}" for e in self.errors ) - return header + errors def as_html(self) -> str: pass diff --git a/src/olm/util.py b/src/olm/util.py index 9799eec..232060e 100644 --- a/src/olm/util.py +++ b/src/olm/util.py @@ -41,6 +41,14 @@ def non_null_unique(arr: pd.Series) -> pd.Series: return uniq[~pd.isna(uniq)] +def msg_ok(module: str, s: str): + print(f"\033[0;32m✓ olm[{module}]\t\033[0m {s}") + + +def msg_fail(module: str, s: str): + print(f"\033[0;31m✗ olm[{module}]\t\033[0m {s}") + + def fix_datetimes(df: pd.DataFrame, additional_date_columns: list[str] = []): "Convert date fields to datetime in place" date_columns = [ From 198af2aca4461dd0ffbcb8a14eaf4e1797dc8fbf Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Sun, 8 Sep 2024 23:16:45 +0100 Subject: [PATCH 07/10] lint: add --schema option --- src/olm/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/olm/__init__.py b/src/olm/__init__.py index a102209..59e2b65 100644 --- a/src/olm/__init__.py +++ b/src/olm/__init__.py @@ -44,6 +44,7 @@ def main(): ) lint_parser.add_argument("outbreak", help="Outbreak name") lint_parser.add_argument("--data", help="Data URL") + lint_parser.add_argument("--schema", help="Data schema path or URL") get_parser = subparsers.add_parser("get", help="Get data for outbreak") get_parser.add_argument("outbreak", help="Outbreak name") @@ -87,7 +88,7 @@ def main(): Path(output_file).write_text(res.text) msg_ok("get", "wrote " + output_file) case "lint": - lint_result = lint(args.outbreak, args.data) + lint_result = lint(args.outbreak, args.data, args.schema) if lint_result.ok: msg_ok("lint", "succeeded for " + args.outbreak) else: From 3e1659ef6f9f102eef46da983a17398925ccab22 Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Sun, 8 Sep 2024 23:58:16 +0100 Subject: [PATCH 08/10] lint: add --ignore --- src/olm/__init__.py | 14 +++++++++----- src/olm/lint.py | 9 +++++++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/olm/__init__.py b/src/olm/__init__.py index 59e2b65..1518759 100644 --- a/src/olm/__init__.py +++ b/src/olm/__init__.py @@ -45,6 +45,7 @@ def main(): lint_parser.add_argument("outbreak", help="Outbreak name") lint_parser.add_argument("--data", help="Data URL") lint_parser.add_argument("--schema", help="Data schema path or URL") + lint_parser.add_argument("--ignore", help="Ignore fields, comma-separated") get_parser = subparsers.add_parser("get", help="Get data for outbreak") get_parser.add_argument("outbreak", help="Outbreak name") @@ -71,6 +72,7 @@ def main(): + ", ".join(OUTBREAKS) + "\033[0m" ) + bold_outbreak = f"\033[1m{args.outbreak}\033[0m" match args.command: case "list": @@ -80,7 +82,7 @@ def main(): ) case "get": if "url" not in OUTBREAKS[args.outbreak]: - abort(f"no data URL found for \033[1m{args.outbreak}\033[0m") + abort(f"no data URL found for {bold_outbreak}") output_file = f"{args.outbreak}.csv" if ( res := requests.get(OUTBREAKS[args.outbreak]["url"]) @@ -88,11 +90,13 @@ def main(): Path(output_file).write_text(res.text) msg_ok("get", "wrote " + output_file) case "lint": - lint_result = lint(args.outbreak, args.data, args.schema) - if lint_result.ok: - msg_ok("lint", "succeeded for " + args.outbreak) + ignore_keys = args.ignore.split(",") if args.ignore is not None else [] + if ( + lint_result := lint(args.outbreak, args.data, args.schema, ignore_keys) + ).ok: + msg_ok("lint", "succeeded for " + bold_outbreak) else: - msg_fail("lint", "failed for " + args.outbreak) + msg_fail("lint", "failed for " + bold_outbreak) print(lint_result) case "report": make_report( diff --git a/src/olm/lint.py b/src/olm/lint.py index 820f0a6..dfeed54 100644 --- a/src/olm/lint.py +++ b/src/olm/lint.py @@ -13,7 +13,10 @@ def lint( - outbreak: str, file: str | None = None, schema_path: str | Path | None = None + outbreak: str, + file: str | None = None, + schema_path: str | Path | None = None, + ignore_fields: list[str] = [], ) -> LintResult: errors: list[RowError] = [] # do not convert dates as fastjsonschema will check date string representation @@ -25,7 +28,9 @@ def lint( for row in df.to_dict("records"): id = row["ID"] - nrow = {k: v for k, v in row.items() if pd.notnull(v)} + nrow = { + k: v for k, v in row.items() if pd.notnull(v) and k not in ignore_fields + } try: validator(nrow) except fastjsonschema.JsonSchemaValueException as e: From 5c1ba3dab17006396b086307af890291828569c5 Mon Sep 17 00:00:00 2001 From: Abhishek Dasgupta Date: Mon, 9 Sep 2024 00:20:20 +0100 Subject: [PATCH 09/10] outbreaks(header): add down arrow icon --- src/olm/outbreaks/_header.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/olm/outbreaks/_header.html b/src/olm/outbreaks/_header.html index 0ad4f5b..5d49dda 100644 --- a/src/olm/outbreaks/_header.html +++ b/src/olm/outbreaks/_header.html @@ -16,7 +16,7 @@