From b22e5c3c4bed20f09834bfe0dd67fd1b4cf3abaf Mon Sep 17 00:00:00 2001
From: Gavin
Date: Fri, 24 Dec 2021 13:30:00 -0800
Subject: [PATCH 1/5] Add import spec writers for xsv files

Will eventually be hooked up to a service endpoint to allow the narrative
to generate import templates on command
---
 .gitignore                                         |   3 +-
 .../import_specifications/file_writers.py          | 175 ++++++++++++++
 .../test_file_writers.py                           | 228 ++++++++++++++++++
 3 files changed, 405 insertions(+), 1 deletion(-)
 create mode 100644 staging_service/import_specifications/file_writers.py
 create mode 100644 tests/import_specifications/test_file_writers.py

diff --git a/.gitignore b/.gitignore
index f0be0c6..00b31a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -103,4 +103,5 @@ ENV/
 data/
 .DS_Store
 .virtualenvs/
-test.env
\ No newline at end of file
+test.env
+run_tests_single.sh
\ No newline at end of file
diff --git a/staging_service/import_specifications/file_writers.py b/staging_service/import_specifications/file_writers.py
new file mode 100644
index 0000000..13a5b50
--- /dev/null
+++ b/staging_service/import_specifications/file_writers.py
@@ -0,0 +1,175 @@
+"""
+Write an import specification to one or more files.
+
+The names of the files will be the datatype suffixed by the file extension unless the writer
+handles Excel or similar files that can contain multiple datatypes, in which case the
+file name will be {IMPORT_SPEC_FILE_NAME} suffixed by the extension.
+
+All the write_* functions in this module have the same function signature:
+
+:param folder: Where the files should be written. The folder must exist.
+:param types: The import specifications to write. This is a dictionary of data types as strings
+    to the specifications for the data type. Each specification has two required keys:
+    * `order_and_titles`: this is a list of tuples. Each iner tuple has two elements:
+        * The parameter ID of a parameter. This is typically the `id` field from the
+            KBase app `spec.json` file.
+        * The display name of the parameter. This is typically the `ui-name` field from the
+            KBase app `display.yaml` file.
+        The order of the inner tuples in the outer tuple defines the order of the columns
+        in the resulting import specification files.
+    * `data`: this is a list of str->str dicts. The keys of the dicts are the parameter IDs
+        as described above, while the values are the values of the parameters. Each dict
+        must have exactly the same keys as the `order_and_titles` structure. Each entry in the
+        list corresponds to a row in the resulting import specification, and the order of
+        the list defines the order of the rows.
+        Leave the `data` list empty to write an empty template.
+:returns: A mapping of the data types to the files to which they were written.
+"""
+
+
+import collections
+import csv
+import numbers
+
+from typing import Any
+from pathlib import Path
+
+# this version is synonymous with the versions in individual_parsers.py. However, this module
+# should only ever write the most recent format for import specifications, while the parsers
+# may need to also be able to parse earlier versions.
+_VERSION = 1
+
+# these are the same as in individual_parsers.py. They might change from version to version so
+# have a separate copy here.
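+
+# For example, a minimal (hypothetical) input for the write_* functions below,
+# matching the structure described in the module docstring:
+#
+#     types = {
+#         "my_datatype": {
+#             "order_and_display": [
+#                 ["param_id_1", "Parameter 1"],
+#                 ["param_id_2", "Parameter 2"],
+#             ],
+#             "data": [
+#                 {"param_id_1": "value 1A", "param_id_2": 42},
+#                 {"param_id_1": "value 1B", "param_id_2": 6.02},
+#             ],
+#         }
+#     }
+#
+# write_csv(Path("folder"), types) would then write folder/my_datatype.csv containing
+# a "Data type: ..." header line, a parameter ID row, a display name row, and one row
+# per entry in `data`, and return {"my_datatype": Path("folder/my_datatype.csv")}.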
+_DATA_TYPE = "Data type:" +_VERSION_STR = "Version:" +_COLUMN_STR = "Columns:" +_HEADER_SEP = ";" + +_IMPORT_SPEC_FILE_NAME = "import_specification" + +_ORDER_AND_DISPLAY = "order_and_display" +_DATA = "data" +_EXT_CSV = "csv" +_EXT_TSV = "tsv" +_EXT_EXCEL = "xlsx" +_SEP_CSV = "," +_SEP_TSV = "\t" + +def _check_import_specification(types: dict[str, dict[str, list[Any]]]): + f""" + Check the structure of an import specification data structure. If the input is empty the + result is a noop. + + :param types: The import specifications to check. This is a dictionary of data types as strings + to the specifications for the data type. Each specification has two required keys: + * {_ORDER_AND_DISPLAY}: this is a list of lists. Each inner list has two elements: + * The parameter ID of a parameter. This is typically the `id` field from the + KBase app `spec.json` file. + * The display name of the parameter. This is typically the `ui-name` field from the + KBase app `display.yaml` file. + The order of the inner lists in the outer list defines the order of the columns + in the resulting import specification files. + * {_DATA}: this is a list of str->str dicts. The keys of the dicts are the parameter IDs + as described above, while the values are the values of the parameters. Each dict + must have exactly the same keys as the {_ORDER_AND_DISPLAY} structure. Each entry in the + list corresponds to a row in the resulting import specification, and the order of + the list defines the order of the rows. + Leave the {_DATA} list empty to write an empty template. + """ + if not types: + return + for datatype in types: + # replace this with jsonschema? don't worry about it for now + _check_string(datatype, "A data type") + spec = types[datatype] + if type(spec) != dict: + raise ValueError(f"The value for data type {datatype} must be a mapping") + if _ORDER_AND_DISPLAY not in spec: + raise ValueError(f"Data type {datatype} missing {_ORDER_AND_DISPLAY} key") + _check_is_sequence( + spec[_ORDER_AND_DISPLAY], f"Data type {datatype} {_ORDER_AND_DISPLAY} value") + if not len(spec[_ORDER_AND_DISPLAY]): + raise ValueError(f"At least one entry is required for {_ORDER_AND_DISPLAY} " + + f"for type {datatype}") + if _DATA not in spec: + raise ValueError(f"Data type {datatype} missing {_DATA} key") + _check_is_sequence(spec[_DATA], f"Data type {datatype} {_DATA} value") + + param_ids = set() + for i, id_display in enumerate(spec[_ORDER_AND_DISPLAY]): + err = (f"Invalid {_ORDER_AND_DISPLAY} entry for datatype {datatype} " + + f"at index {i} ") + _check_is_sequence(id_display, err + "- the entry") + if len(id_display) != 2: + raise ValueError(err + "- expected 2 item list") + pid = id_display[0] + _check_string(pid, err + "- parameter ID") + _check_string(id_display[1], err + "- parameter display name") + param_ids.add(pid) + for i, datarow in enumerate(spec[_DATA]): + err = f"Data type {datatype} {_DATA} row {i}" + if type(datarow) != dict: + raise ValueError(err + " is not a mapping") + if datarow.keys() != param_ids: + raise ValueError(err + f" does not have the same keys as {_ORDER_AND_DISPLAY}") + for pid, v in datarow.items(): + if v is not None and not isinstance(v, numbers.Number) and not isinstance(v, str): + raise ValueError( + err + f"'s value for parameter {pid} is not a number or a string") + + +def _check_string(tocheck: Any, errprefix: str): + if not isinstance(tocheck, str) or not tocheck.strip(): + raise ValueError(errprefix + " cannot be a non-string or a whitespace only string") + + +def 
_check_is_sequence(tocheck: Any, errprefix: str): + if not (isinstance(tocheck, collections.abc.Sequence) and not isinstance(tocheck, str)): + raise ValueError(errprefix + " is not a list") + + +def write_csv( + folder: Path, + types: dict[str, dict[str, list[Any]]], +) -> dict[str, Path]: + """ + Writes import specifications to 1 or more csv files. All the writers in this module + have the same function signatures; see the module level documentation. + """ + return _write_xsv(folder, types, _EXT_CSV, _SEP_CSV) + + +def write_tsv( + folder: Path, + types: dict[str, dict[str, list[Any]]], +) -> dict[str, Path]: + """ + Writes import specifications to 1 or more tsv files. All the writers in this module + have the same function signatures; see the module level documentation. + """ + return _write_xsv(folder, types, _EXT_TSV, _SEP_TSV) + + +def _write_xsv(folder: Path, types: dict[str, dict[str, list[Any]]], ext: str, sep: str): + if not folder: + raise ValueError("The folder cannot be null") + if type(types) != dict: + raise ValueError("The types value must be a mapping") + _check_import_specification(types) + res = {} + for datatype in types: + out = folder / (datatype + "." + ext) + dt = types[datatype] + cols = len(dt[_ORDER_AND_DISPLAY]) + with open(out, "w", newline='') as f: + csvw = csv.writer(f, delimiter=sep) # handle sep escaping + csvw.writerow([f"{_DATA_TYPE} {datatype}{_HEADER_SEP} " + + f"{_COLUMN_STR} {cols}{_HEADER_SEP} {_VERSION_STR} {_VERSION}"]) + pids = [i[0] for i in dt[_ORDER_AND_DISPLAY]] + csvw.writerow(pids) + csvw.writerow([i[1] for i in dt[_ORDER_AND_DISPLAY]]) + for row in dt[_DATA]: + csvw.writerow([row[pid] for pid in pids]) + res[datatype] = out + return res diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py new file mode 100644 index 0000000..d38a4ab --- /dev/null +++ b/tests/import_specifications/test_file_writers.py @@ -0,0 +1,228 @@ +import uuid + +from collections.abc import Generator +from pathlib import Path +from pytest import raises, fixture +from tests.test_app import FileUtil + +from staging_service.import_specifications.file_writers import ( + write_csv, + write_tsv, +) + +from tests.test_utils import assert_exception_correct + + +@fixture(scope="module") +def temp_dir() -> Generator[Path, None, None]: + with FileUtil() as fu: + childdir = Path(fu.make_dir(str(uuid.uuid4()))).resolve() + + yield childdir + + # FileUtil will auto delete after exiting + + +_TEST_DATA = { + "type1": { + "order_and_display": [ + ("id1", "thing,with,comma"), + ("id2", "other"), + ("id3", "thing\twith\ttabs"), + ], + "data": [ + {"id3": "comma,comma", "id1": "yay!\ttab", "id2": 42}, + {"id3": 56.78, "id1": "boo!", "id2": None}, + ] + }, + "type2": { + "order_and_display": [ + ("id1", "oh no I only have one column"), + ], + "data": [ + {"id1": "foo"}, + {"id1": 0}, + ] + }, + "type3": { + "order_and_display": [ + ("some_id", "hey this"), + ("tab\tid", "xsv is only"), + ("comma,id", "a template"), + ], + "data": [] + } +} + +def test_noop(): + assert write_csv(Path("."), {}) == {} + assert write_tsv(Path("."), {}) == {} + +def test_write_csv(temp_dir: Path): + res = write_csv(Path(temp_dir), _TEST_DATA) + assert res == { + "type1": Path(temp_dir) / "type1.csv", + "type2": Path(temp_dir) / "type2.csv", + "type3": Path(temp_dir) / "type3.csv", + } + _check_contents( + Path(temp_dir) / "type1.csv", + [ + "Data type: type1; Columns: 3; Version: 1\n", + "id1,id2,id3\n", + '"thing,with,comma",other,thing\twith\ttabs\n', 
+ 'yay!\ttab,42,"comma,comma"\n', + "boo!,,56.78\n", + ] + ) + _check_contents( + Path(temp_dir) / "type2.csv", + [ + "Data type: type2; Columns: 1; Version: 1\n", + "id1\n", + "oh no I only have one column\n", + "foo\n", + "0\n", + ] + ) + _check_contents( + Path(temp_dir) / "type3.csv", + [ + "Data type: type3; Columns: 3; Version: 1\n", + 'some_id,tab\tid,"comma,id"\n', + "hey this,xsv is only,a template\n", + ] + ) + + +def test_write_tsv(temp_dir: Path): + res = write_tsv(Path(temp_dir), _TEST_DATA) + assert res == { + "type1": Path(temp_dir) / "type1.tsv", + "type2": Path(temp_dir) / "type2.tsv", + "type3": Path(temp_dir) / "type3.tsv", + } + _check_contents( + Path(temp_dir) / "type1.tsv", + [ + "Data type: type1; Columns: 3; Version: 1\n", + "id1\tid2\tid3\n", + 'thing,with,comma\tother\t"thing\twith\ttabs"\n', + '"yay!\ttab"\t42\tcomma,comma\n', + "boo!\t\t56.78\n", + ] + ) + _check_contents( + Path(temp_dir) / "type2.tsv", + [ + "Data type: type2; Columns: 1; Version: 1\n", + "id1\n", + "oh no I only have one column\n", + "foo\n", + "0\n", + ] + ) + _check_contents( + Path(temp_dir) / "type3.tsv", + [ + "Data type: type3; Columns: 3; Version: 1\n", + 'some_id\t"tab\tid"\tcomma,id\n', + "hey this\txsv is only\ta template\n", + ] + ) + + +def _check_contents(file: Path, lines: list[str]): + with open(file) as f: + assert f.readlines() == lines + + +def test_file_writers_fail(): + p = Path() + file_writers_fail(None, {}, ValueError("The folder cannot be null")) + file_writers_fail(p, None, ValueError("The types value must be a mapping")) + file_writers_fail( + p, {None: 1}, ValueError("A data type cannot be a non-string or a whitespace only string")) + file_writers_fail( + p, + {" \t ": 1}, + ValueError("A data type cannot be a non-string or a whitespace only string")) + file_writers_fail(p, {"t": []}, ValueError("The value for data type t must be a mapping")) + file_writers_fail(p, {"t": 1}, ValueError("The value for data type t must be a mapping")) + file_writers_fail(p, {"t": {}}, ValueError("Data type t missing order_and_display key")) + file_writers_fail( + p, + {"t": {"order_and_display": {}, "data": []}}, + ValueError("Data type t order_and_display value is not a list")) + file_writers_fail( + p, + {"t": {"order_and_display": [], "data": []}}, + ValueError("At least one entry is required for order_and_display for type t")) + file_writers_fail( + p, + {"t": {"order_and_display": [["foo", "bar"]]}}, + ValueError("Data type t missing data key")) + file_writers_fail( + p, + {"t": {"order_and_display": [["foo", "bar"]], "data": "foo"}}, + ValueError("Data type t data value is not a list")) + file_writers_fail( + p, + {"t": {"order_and_display": [["foo", "bar"], "baz"] , "data": []}}, + ValueError("Invalid order_and_display entry for datatype t at index 1 - " + + "the entry is not a list")) + file_writers_fail( + p, + {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"], ["baz"]] , "data": []}}, + ValueError("Invalid order_and_display entry for datatype t at index 2 - " + + "expected 2 item list")) + file_writers_fail( + p, + {"t": {"order_and_display": [("foo", "bar", "whee"), ["whee", "whoo"]] , "data": []}}, + ValueError("Invalid order_and_display entry for datatype t at index 0 - " + + "expected 2 item list")) + for parm in [None, " \t ", 1]: + file_writers_fail( + p, + {"t": {"order_and_display": [(parm, "foo"), ["whee", "whoo"]], "data": []}}, + ValueError("Invalid order_and_display entry for datatype t at index 0 - " + + "parameter ID cannot be a non-string or a 
whitespace only string")) + file_writers_fail( + p, + {"t": {"order_and_display": [("bar", "foo"), ["whee", parm]], "data": []}}, + ValueError("Invalid order_and_display entry for datatype t at index 1 - " + + "parameter display name cannot be a non-string or a whitespace only string")) + file_writers_fail( + p, + {"t": {"order_and_display": [("bar", "foo"), ["whee", "whoo"]], "data": ["foo"]}}, + ValueError("Data type t data row 0 is not a mapping")) + file_writers_fail( + p, + {"t": {"order_and_display": [("bar", "foo")], "data": [{"bar": 1}, []]}}, + ValueError("Data type t data row 1 is not a mapping")) + file_writers_fail( + p, + {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], + "data": [{"foo": 1, "whee": 2}, {"foo": 2}]}}, + ValueError("Data type t data row 1 does not have the same keys as order_and_display")) + file_writers_fail( + p, + {"ty": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], + "data": [{"foo": 2, "whee": 3, 5: 4}, {"foo": 1, "whee": 2}]}}, + ValueError("Data type ty data row 0 does not have the same keys as order_and_display")) + file_writers_fail( + p, + {"ty": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], + "data": [{"foo": 2, "whee": 3}, {"foo": 1, "whee": []}]}}, + ValueError("Data type ty data row 1's value for parameter whee " + + "is not a number or a string")) + + + +def file_writers_fail(path: Path, types: dict, expected: Exception): + with raises(Exception) as got: + write_csv(path, types) + assert_exception_correct(got.value, expected) + with raises(Exception) as got: + write_tsv(path, types) + assert_exception_correct(got.value, expected) \ No newline at end of file From f52b390b5127728168159a6521c869da4e51a1fb Mon Sep 17 00:00:00 2001 From: Gavin Date: Fri, 24 Dec 2021 15:26:13 -0800 Subject: [PATCH 2/5] Fix up docs --- .../import_specifications/file_writers.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/staging_service/import_specifications/file_writers.py b/staging_service/import_specifications/file_writers.py index 13a5b50..6ab7ccb 100644 --- a/staging_service/import_specifications/file_writers.py +++ b/staging_service/import_specifications/file_writers.py @@ -3,29 +3,30 @@ The names of the files will be the datatype suffixed by the file extension unless the writer handles Excel or similar files that can contain multiple datatypes, in which case the -file name will be {IMPORT_SPEC_FILE_NAME} suffixed by the extension. +file name will be import_specification suffixed by the extension. All the write_* functions in this module have the same function signature: :param folder: Where the files should be written. The folder must exist. :param types: The import specifications to write. This is a dictionary of data types as strings to the specifications for the data type. Each specification has two required keys: - * `order_and_titles`: this is a list of tuples. Each iner tuple has two elements: + * `order_and_display`: this is a list of lists. Each inner list has two elements: * The parameter ID of a parameter. This is typically the `id` field from the KBase app `spec.json` file. * The display name of the parameter. This is typically the `ui-name` field from the KBase app `display.yaml` file. - The order of the inner tuples in the outer tuple defines the order of the columns + The order of the inner lists in the outer list defines the order of the columns in the resulting import specification files. - * `data`: this is a list of str->str dicts. 
The keys of the dicts are the parameter IDs - as described above, while the values are the values of the parameters. Each dict - must have exactly the same keys as the `order_and_titles` structure. Each entry in the - list corresponds to a row in the resulting import specification, and the order of - the list defines the order of the rows. + * `data`: this is a list of str->str or number dicts. The keys of the dicts are the + parameter IDs as described above, while the values are the values of the parameters. + Each dict must have exactly the same keys as the `order_and_display` structure. Each + entry in the list corresponds to a row in the resulting import specification, + and the order of the list defines the order of the rows. Leave the `data` list empty to write an empty template. :returns: A mapping of the data types to the files to which they were written. """ - +# note that we can't use an f string here to interpolate the variables below, e.g. +# order_and_display, etc. import collections import csv @@ -70,11 +71,11 @@ def _check_import_specification(types: dict[str, dict[str, list[Any]]]): KBase app `display.yaml` file. The order of the inner lists in the outer list defines the order of the columns in the resulting import specification files. - * {_DATA}: this is a list of str->str dicts. The keys of the dicts are the parameter IDs - as described above, while the values are the values of the parameters. Each dict - must have exactly the same keys as the {_ORDER_AND_DISPLAY} structure. Each entry in the - list corresponds to a row in the resulting import specification, and the order of - the list defines the order of the rows. + * {_DATA}: this is a list of str->str or number dicts. The keys of the dicts are the + parameter IDs as described above, while the values are the values of the parameters. + Each dict must have exactly the same keys as the {_ORDER_AND_DISPLAY} structure. + Each entry in the list corresponds to a row in the resulting import specification, + and the order of the list defines the order of the rows. Leave the {_DATA} list empty to write an empty template. """ if not types: From 4d4f878d7b407180dd1235cee0fe956d3d04bda2 Mon Sep 17 00:00:00 2001 From: Gavin Date: Sun, 26 Dec 2021 10:43:10 -0800 Subject: [PATCH 3/5] Remove extraneous Path() calls, minor refactoring --- .../import_specifications/file_writers.py | 26 ++++++++--------- .../test_file_writers.py | 28 +++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/staging_service/import_specifications/file_writers.py b/staging_service/import_specifications/file_writers.py index 6ab7ccb..1991133 100644 --- a/staging_service/import_specifications/file_writers.py +++ b/staging_service/import_specifications/file_writers.py @@ -130,10 +130,9 @@ def _check_is_sequence(tocheck: Any, errprefix: str): raise ValueError(errprefix + " is not a list") -def write_csv( - folder: Path, - types: dict[str, dict[str, list[Any]]], -) -> dict[str, Path]: +# TODO WRITE_XSV look into server OOM protection if the user sends a huge JSON packet + +def write_csv(folder: Path, types: dict[str, dict[str, list[Any]]]) -> dict[str, Path]: """ Writes import specifications to 1 or more csv files. All the writers in this module have the same function signatures; see the module level documentation. 
@@ -141,10 +140,7 @@ def write_csv( return _write_xsv(folder, types, _EXT_CSV, _SEP_CSV) -def write_tsv( - folder: Path, - types: dict[str, dict[str, list[Any]]], -) -> dict[str, Path]: +def write_tsv(folder: Path, types: dict[str, dict[str, list[Any]]]) -> dict[str, Path]: """ Writes import specifications to 1 or more tsv files. All the writers in this module have the same function signatures; see the module level documentation. @@ -153,11 +149,7 @@ def write_tsv( def _write_xsv(folder: Path, types: dict[str, dict[str, list[Any]]], ext: str, sep: str): - if not folder: - raise ValueError("The folder cannot be null") - if type(types) != dict: - raise ValueError("The types value must be a mapping") - _check_import_specification(types) + _check_write_args(folder, types) res = {} for datatype in types: out = folder / (datatype + "." + ext) @@ -174,3 +166,11 @@ def _write_xsv(folder: Path, types: dict[str, dict[str, list[Any]]], ext: str, s csvw.writerow([row[pid] for pid in pids]) res[datatype] = out return res + + +def _check_write_args(folder: Path, types: dict[str, dict[str, list[Any]]]): + if not folder: + raise ValueError("The folder cannot be null") + if type(types) != dict: + raise ValueError("The types value must be a mapping") + _check_import_specification(types) \ No newline at end of file diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py index d38a4ab..8e4bb24 100644 --- a/tests/import_specifications/test_file_writers.py +++ b/tests/import_specifications/test_file_writers.py @@ -59,14 +59,14 @@ def test_noop(): assert write_tsv(Path("."), {}) == {} def test_write_csv(temp_dir: Path): - res = write_csv(Path(temp_dir), _TEST_DATA) + res = write_csv(temp_dir, _TEST_DATA) assert res == { - "type1": Path(temp_dir) / "type1.csv", - "type2": Path(temp_dir) / "type2.csv", - "type3": Path(temp_dir) / "type3.csv", + "type1": temp_dir / "type1.csv", + "type2": temp_dir / "type2.csv", + "type3": temp_dir / "type3.csv", } _check_contents( - Path(temp_dir) / "type1.csv", + temp_dir / "type1.csv", [ "Data type: type1; Columns: 3; Version: 1\n", "id1,id2,id3\n", @@ -76,7 +76,7 @@ def test_write_csv(temp_dir: Path): ] ) _check_contents( - Path(temp_dir) / "type2.csv", + temp_dir / "type2.csv", [ "Data type: type2; Columns: 1; Version: 1\n", "id1\n", @@ -86,7 +86,7 @@ def test_write_csv(temp_dir: Path): ] ) _check_contents( - Path(temp_dir) / "type3.csv", + temp_dir / "type3.csv", [ "Data type: type3; Columns: 3; Version: 1\n", 'some_id,tab\tid,"comma,id"\n', @@ -96,14 +96,14 @@ def test_write_csv(temp_dir: Path): def test_write_tsv(temp_dir: Path): - res = write_tsv(Path(temp_dir), _TEST_DATA) + res = write_tsv(temp_dir, _TEST_DATA) assert res == { - "type1": Path(temp_dir) / "type1.tsv", - "type2": Path(temp_dir) / "type2.tsv", - "type3": Path(temp_dir) / "type3.tsv", + "type1": temp_dir / "type1.tsv", + "type2": temp_dir / "type2.tsv", + "type3": temp_dir / "type3.tsv", } _check_contents( - Path(temp_dir) / "type1.tsv", + temp_dir / "type1.tsv", [ "Data type: type1; Columns: 3; Version: 1\n", "id1\tid2\tid3\n", @@ -113,7 +113,7 @@ def test_write_tsv(temp_dir: Path): ] ) _check_contents( - Path(temp_dir) / "type2.tsv", + temp_dir / "type2.tsv", [ "Data type: type2; Columns: 1; Version: 1\n", "id1\n", @@ -123,7 +123,7 @@ def test_write_tsv(temp_dir: Path): ] ) _check_contents( - Path(temp_dir) / "type3.tsv", + temp_dir / "type3.tsv", [ "Data type: type3; Columns: 3; Version: 1\n", 'some_id\t"tab\tid"\tcomma,id\n', From 
e5d619ead1ab08b963081e1c592c098a27b4d3d4 Mon Sep 17 00:00:00 2001 From: Gavin Date: Sun, 26 Dec 2021 10:53:07 -0800 Subject: [PATCH 4/5] Add custom write exception Allows the app class to tell the difference between bad input and unexpected failures (e.g. fail to open file, etc) --- .../import_specifications/file_writers.py | 38 +++++++++++------ .../test_file_writers.py | 41 ++++++++++--------- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/staging_service/import_specifications/file_writers.py b/staging_service/import_specifications/file_writers.py index 1991133..5f76cdf 100644 --- a/staging_service/import_specifications/file_writers.py +++ b/staging_service/import_specifications/file_writers.py @@ -85,16 +85,17 @@ def _check_import_specification(types: dict[str, dict[str, list[Any]]]): _check_string(datatype, "A data type") spec = types[datatype] if type(spec) != dict: - raise ValueError(f"The value for data type {datatype} must be a mapping") + raise ImportSpecWriteException(f"The value for data type {datatype} must be a mapping") if _ORDER_AND_DISPLAY not in spec: - raise ValueError(f"Data type {datatype} missing {_ORDER_AND_DISPLAY} key") + raise ImportSpecWriteException( + f"Data type {datatype} missing {_ORDER_AND_DISPLAY} key") _check_is_sequence( spec[_ORDER_AND_DISPLAY], f"Data type {datatype} {_ORDER_AND_DISPLAY} value") if not len(spec[_ORDER_AND_DISPLAY]): - raise ValueError(f"At least one entry is required for {_ORDER_AND_DISPLAY} " - + f"for type {datatype}") + raise ImportSpecWriteException( + f"At least one entry is required for {_ORDER_AND_DISPLAY} for type {datatype}") if _DATA not in spec: - raise ValueError(f"Data type {datatype} missing {_DATA} key") + raise ImportSpecWriteException(f"Data type {datatype} missing {_DATA} key") _check_is_sequence(spec[_DATA], f"Data type {datatype} {_DATA} value") param_ids = set() @@ -103,7 +104,7 @@ def _check_import_specification(types: dict[str, dict[str, list[Any]]]): + f"at index {i} ") _check_is_sequence(id_display, err + "- the entry") if len(id_display) != 2: - raise ValueError(err + "- expected 2 item list") + raise ImportSpecWriteException(err + "- expected 2 item list") pid = id_display[0] _check_string(pid, err + "- parameter ID") _check_string(id_display[1], err + "- parameter display name") @@ -111,23 +112,25 @@ def _check_import_specification(types: dict[str, dict[str, list[Any]]]): for i, datarow in enumerate(spec[_DATA]): err = f"Data type {datatype} {_DATA} row {i}" if type(datarow) != dict: - raise ValueError(err + " is not a mapping") + raise ImportSpecWriteException(err + " is not a mapping") if datarow.keys() != param_ids: - raise ValueError(err + f" does not have the same keys as {_ORDER_AND_DISPLAY}") + raise ImportSpecWriteException( + err + f" does not have the same keys as {_ORDER_AND_DISPLAY}") for pid, v in datarow.items(): if v is not None and not isinstance(v, numbers.Number) and not isinstance(v, str): - raise ValueError( + raise ImportSpecWriteException( err + f"'s value for parameter {pid} is not a number or a string") def _check_string(tocheck: Any, errprefix: str): if not isinstance(tocheck, str) or not tocheck.strip(): - raise ValueError(errprefix + " cannot be a non-string or a whitespace only string") + raise ImportSpecWriteException( + errprefix + " cannot be a non-string or a whitespace only string") def _check_is_sequence(tocheck: Any, errprefix: str): if not (isinstance(tocheck, collections.abc.Sequence) and not isinstance(tocheck, str)): - raise ValueError(errprefix + " is 
not a list") + raise ImportSpecWriteException(errprefix + " is not a list") # TODO WRITE_XSV look into server OOM protection if the user sends a huge JSON packet @@ -170,7 +173,16 @@ def _write_xsv(folder: Path, types: dict[str, dict[str, list[Any]]], ext: str, s def _check_write_args(folder: Path, types: dict[str, dict[str, list[Any]]]): if not folder: + # this is a programming error, not a user input error, so not using the custom + # exception here raise ValueError("The folder cannot be null") if type(types) != dict: - raise ValueError("The types value must be a mapping") - _check_import_specification(types) \ No newline at end of file + raise ImportSpecWriteException("The types value must be a mapping") + _check_import_specification(types) + + +class ImportSpecWriteException(Exception): + """ + An exception thrown when writing an import specification fails. + """ + pass \ No newline at end of file diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py index 8e4bb24..f00c25a 100644 --- a/tests/import_specifications/test_file_writers.py +++ b/tests/import_specifications/test_file_writers.py @@ -8,6 +8,7 @@ from staging_service.import_specifications.file_writers import ( write_csv, write_tsv, + ImportSpecWriteException, ) from tests.test_utils import assert_exception_correct @@ -139,82 +140,82 @@ def _check_contents(file: Path, lines: list[str]): def test_file_writers_fail(): p = Path() + E = ImportSpecWriteException file_writers_fail(None, {}, ValueError("The folder cannot be null")) - file_writers_fail(p, None, ValueError("The types value must be a mapping")) file_writers_fail( - p, {None: 1}, ValueError("A data type cannot be a non-string or a whitespace only string")) + p, {None: 1}, E("A data type cannot be a non-string or a whitespace only string")) file_writers_fail( p, {" \t ": 1}, - ValueError("A data type cannot be a non-string or a whitespace only string")) - file_writers_fail(p, {"t": []}, ValueError("The value for data type t must be a mapping")) - file_writers_fail(p, {"t": 1}, ValueError("The value for data type t must be a mapping")) - file_writers_fail(p, {"t": {}}, ValueError("Data type t missing order_and_display key")) + E("A data type cannot be a non-string or a whitespace only string")) + file_writers_fail(p, {"t": []}, E("The value for data type t must be a mapping")) + file_writers_fail(p, {"t": 1}, E("The value for data type t must be a mapping")) + file_writers_fail(p, {"t": {}}, E("Data type t missing order_and_display key")) file_writers_fail( p, {"t": {"order_and_display": {}, "data": []}}, - ValueError("Data type t order_and_display value is not a list")) + E("Data type t order_and_display value is not a list")) file_writers_fail( p, {"t": {"order_and_display": [], "data": []}}, - ValueError("At least one entry is required for order_and_display for type t")) + E("At least one entry is required for order_and_display for type t")) file_writers_fail( p, {"t": {"order_and_display": [["foo", "bar"]]}}, - ValueError("Data type t missing data key")) + E("Data type t missing data key")) file_writers_fail( p, {"t": {"order_and_display": [["foo", "bar"]], "data": "foo"}}, - ValueError("Data type t data value is not a list")) + E("Data type t data value is not a list")) file_writers_fail( p, {"t": {"order_and_display": [["foo", "bar"], "baz"] , "data": []}}, - ValueError("Invalid order_and_display entry for datatype t at index 1 - " + E("Invalid order_and_display entry for datatype t at index 1 - " + "the entry 
is not a list")) file_writers_fail( p, {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"], ["baz"]] , "data": []}}, - ValueError("Invalid order_and_display entry for datatype t at index 2 - " + E("Invalid order_and_display entry for datatype t at index 2 - " + "expected 2 item list")) file_writers_fail( p, {"t": {"order_and_display": [("foo", "bar", "whee"), ["whee", "whoo"]] , "data": []}}, - ValueError("Invalid order_and_display entry for datatype t at index 0 - " + E("Invalid order_and_display entry for datatype t at index 0 - " + "expected 2 item list")) for parm in [None, " \t ", 1]: file_writers_fail( p, {"t": {"order_and_display": [(parm, "foo"), ["whee", "whoo"]], "data": []}}, - ValueError("Invalid order_and_display entry for datatype t at index 0 - " + E("Invalid order_and_display entry for datatype t at index 0 - " + "parameter ID cannot be a non-string or a whitespace only string")) file_writers_fail( p, {"t": {"order_and_display": [("bar", "foo"), ["whee", parm]], "data": []}}, - ValueError("Invalid order_and_display entry for datatype t at index 1 - " + E("Invalid order_and_display entry for datatype t at index 1 - " + "parameter display name cannot be a non-string or a whitespace only string")) file_writers_fail( p, {"t": {"order_and_display": [("bar", "foo"), ["whee", "whoo"]], "data": ["foo"]}}, - ValueError("Data type t data row 0 is not a mapping")) + E("Data type t data row 0 is not a mapping")) file_writers_fail( p, {"t": {"order_and_display": [("bar", "foo")], "data": [{"bar": 1}, []]}}, - ValueError("Data type t data row 1 is not a mapping")) + E("Data type t data row 1 is not a mapping")) file_writers_fail( p, {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], "data": [{"foo": 1, "whee": 2}, {"foo": 2}]}}, - ValueError("Data type t data row 1 does not have the same keys as order_and_display")) + E("Data type t data row 1 does not have the same keys as order_and_display")) file_writers_fail( p, {"ty": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], "data": [{"foo": 2, "whee": 3, 5: 4}, {"foo": 1, "whee": 2}]}}, - ValueError("Data type ty data row 0 does not have the same keys as order_and_display")) + E("Data type ty data row 0 does not have the same keys as order_and_display")) file_writers_fail( p, {"ty": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], "data": [{"foo": 2, "whee": 3}, {"foo": 1, "whee": []}]}}, - ValueError("Data type ty data row 1's value for parameter whee " + E("Data type ty data row 1's value for parameter whee " + "is not a number or a string")) From d3a52f27bbfcd21ef76d0616a6d62333e3c7f2ea Mon Sep 17 00:00:00 2001 From: Gavin Date: Sun, 26 Dec 2021 11:06:07 -0800 Subject: [PATCH 5/5] restore missing test line --- tests/import_specifications/test_file_writers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py index f00c25a..8021e4a 100644 --- a/tests/import_specifications/test_file_writers.py +++ b/tests/import_specifications/test_file_writers.py @@ -142,6 +142,7 @@ def test_file_writers_fail(): p = Path() E = ImportSpecWriteException file_writers_fail(None, {}, ValueError("The folder cannot be null")) + file_writers_fail(p, None, E("The types value must be a mapping")) file_writers_fail( p, {None: 1}, E("A data type cannot be a non-string or a whitespace only string")) file_writers_fail(