diff --git a/.gitignore b/.gitignore
index f0be0c6..00b31a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -103,4 +103,5 @@ ENV/
 data/
 .DS_Store
 .virtualenvs/
-test.env
\ No newline at end of file
+test.env
+run_tests_single.sh
\ No newline at end of file
diff --git a/staging_service/import_specifications/file_writers.py b/staging_service/import_specifications/file_writers.py
new file mode 100644
index 0000000..5f76cdf
--- /dev/null
+++ b/staging_service/import_specifications/file_writers.py
@@ -0,0 +1,188 @@
+"""
+Write an import specification to one or more files.
+
+The names of the files will be the datatype suffixed by the file extension, unless the writer
+handles Excel or similar files that can contain multiple datatypes, in which case the
+file name will be import_specification suffixed by the extension.
+
+All the write_* functions in this module have the same function signature:
+
+:param folder: Where the files should be written. The folder must exist.
+:param types: The import specifications to write. This is a dictionary of data types as strings
+    to the specifications for the data type. Each specification has two required keys:
+    * `order_and_display`: this is a list of lists. Each inner list has two elements:
+        * The parameter ID of a parameter. This is typically the `id` field from the
+            KBase app `spec.json` file.
+        * The display name of the parameter. This is typically the `ui-name` field from the
+            KBase app `display.yaml` file.
+        The order of the inner lists in the outer list defines the order of the columns
+        in the resulting import specification files.
+    * `data`: this is a list of str->str or number dicts. The keys of the dicts are the
+        parameter IDs as described above, while the values are the values of the parameters.
+        Each dict must have exactly the same keys as the `order_and_display` structure. Each
+        entry in the list corresponds to a row in the resulting import specification,
+        and the order of the list defines the order of the rows.
+        Leave the `data` list empty to write an empty template.
+:returns: A mapping of the data types to the files to which they were written.
+"""
+# note that we can't use an f-string here to interpolate the variables below, e.g.
+# order_and_display, etc.
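For concreteness, a minimal sketch of the input structure the docstring describes and a call to `write_csv`. The datatype, parameter IDs, and folder below are hypothetical, purely for illustration:

```python
from pathlib import Path

from staging_service.import_specifications.file_writers import write_csv

# Hypothetical datatype and parameter IDs - not taken from any real KBase app.
types = {
    "genome": {
        "order_and_display": [
            ["genome_name", "Genome Name"],      # [parameter ID, display name]
            ["assembly_file", "Assembly File"],
        ],
        "data": [
            {"genome_name": "my_genome", "assembly_file": "genome.fa.gz"},
        ],
    }
}

folder = Path("specs")
folder.mkdir(exist_ok=True)      # the writers require the folder to already exist
print(write_csv(folder, types))  # -> {'genome': PosixPath('specs/genome.csv')}
```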
+
+import collections
+import csv
+import numbers
+
+from typing import Any
+from pathlib import Path
+
+# this version is synonymous with the versions in individual_parsers.py. However, this module
+# should only ever write the most recent format for import specifications, while the parsers
+# may need to also be able to parse earlier versions.
+_VERSION = 1
+
+# these are the same as in individual_parsers.py. They might change from version to version,
+# so keep a separate copy here.
+_DATA_TYPE = "Data type:"
+_VERSION_STR = "Version:"
+_COLUMN_STR = "Columns:"
+_HEADER_SEP = ";"
+
+_IMPORT_SPEC_FILE_NAME = "import_specification"
+
+_ORDER_AND_DISPLAY = "order_and_display"
+_DATA = "data"
+_EXT_CSV = "csv"
+_EXT_TSV = "tsv"
+_EXT_EXCEL = "xlsx"
+_SEP_CSV = ","
+_SEP_TSV = "\t"
+
+
+def _check_import_specification(types: dict[str, dict[str, list[Any]]]):
+    f"""
+    Check the structure of an import specification data structure. If the input is empty the
+    result is a noop.
+
+    :param types: The import specifications to check. This is a dictionary of data types as
+        strings to the specifications for the data type. Each specification has two required
+        keys:
+        * {_ORDER_AND_DISPLAY}: this is a list of lists. Each inner list has two elements:
+            * The parameter ID of a parameter. This is typically the `id` field from the
+                KBase app `spec.json` file.
+            * The display name of the parameter. This is typically the `ui-name` field from
+                the KBase app `display.yaml` file.
+            The order of the inner lists in the outer list defines the order of the columns
+            in the resulting import specification files.
+        * {_DATA}: this is a list of str->str or number dicts. The keys of the dicts are the
+            parameter IDs as described above, while the values are the values of the
+            parameters. Each dict must have exactly the same keys as the {_ORDER_AND_DISPLAY}
+            structure. Each entry in the list corresponds to a row in the resulting import
+            specification, and the order of the list defines the order of the rows.
+            Leave the {_DATA} list empty to write an empty template.
+    """
+    if not types:
+        return
+    for datatype in types:
+        # replace this with jsonschema? don't worry about it for now
+        _check_string(datatype, "A data type")
+        spec = types[datatype]
+        if type(spec) != dict:
+            raise ImportSpecWriteException(f"The value for data type {datatype} must be a mapping")
+        if _ORDER_AND_DISPLAY not in spec:
+            raise ImportSpecWriteException(
+                f"Data type {datatype} missing {_ORDER_AND_DISPLAY} key")
+        _check_is_sequence(
+            spec[_ORDER_AND_DISPLAY], f"Data type {datatype} {_ORDER_AND_DISPLAY} value")
+        if not len(spec[_ORDER_AND_DISPLAY]):
+            raise ImportSpecWriteException(
+                f"At least one entry is required for {_ORDER_AND_DISPLAY} for type {datatype}")
+        if _DATA not in spec:
+            raise ImportSpecWriteException(f"Data type {datatype} missing {_DATA} key")
+        _check_is_sequence(spec[_DATA], f"Data type {datatype} {_DATA} value")
+
+        param_ids = set()
+        for i, id_display in enumerate(spec[_ORDER_AND_DISPLAY]):
+            err = (f"Invalid {_ORDER_AND_DISPLAY} entry for datatype {datatype} "
+                   + f"at index {i} ")
+            _check_is_sequence(id_display, err + "- the entry")
+            if len(id_display) != 2:
+                raise ImportSpecWriteException(err + "- expected 2 item list")
+            pid = id_display[0]
+            _check_string(pid, err + "- parameter ID")
+            _check_string(id_display[1], err + "- parameter display name")
+            param_ids.add(pid)
+        for i, datarow in enumerate(spec[_DATA]):
+            err = f"Data type {datatype} {_DATA} row {i}"
+            if type(datarow) != dict:
+                raise ImportSpecWriteException(err + " is not a mapping")
+            if datarow.keys() != param_ids:
+                raise ImportSpecWriteException(
+                    err + f" does not have the same keys as {_ORDER_AND_DISPLAY}")
+            for pid, v in datarow.items():
+                if v is not None and not isinstance(v, numbers.Number) and not isinstance(v, str):
+                    raise ImportSpecWriteException(
+                        err + f"'s value for parameter {pid} is not a number or a string")
+
+
+def _check_string(tocheck: Any, errprefix: str):
+    if not isinstance(tocheck, str) or not tocheck.strip():
+        raise ImportSpecWriteException(
+            errprefix + " cannot be a non-string or a whitespace only string")
+
+
+def _check_is_sequence(tocheck: Any, errprefix: str):
+    if not (isinstance(tocheck, collections.abc.Sequence) and not isinstance(tocheck, str)):
+        raise ImportSpecWriteException(errprefix + " is not a list")
+
+
+# TODO WRITE_XSV look into server OOM protection if the user sends a huge JSON packet
+
+def write_csv(folder: Path, types: dict[str, dict[str, list[Any]]]) -> dict[str, Path]:
+    """
+    Writes import specifications to 1 or more csv files. All the writers in this module
+    have the same function signatures; see the module level documentation.
+    """
+    return _write_xsv(folder, types, _EXT_CSV, _SEP_CSV)
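The validation above fails fast, before any file is touched. A small sketch of that contract (the spec below is deliberately malformed; the message matches the row-key check in `_check_import_specification`):

```python
from pathlib import Path

from staging_service.import_specifications.file_writers import (
    ImportSpecWriteException,
    write_csv,
)

# A data row whose keys don't match order_and_display is rejected.
bad = {"genome": {"order_and_display": [["id1", "Name"]], "data": [{"oops": 1}]}}
try:
    write_csv(Path("."), bad)
except ImportSpecWriteException as e:
    print(e)  # Data type genome data row 0 does not have the same keys as order_and_display
```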
+ """ + return _write_xsv(folder, types, _EXT_CSV, _SEP_CSV) + + +def write_tsv(folder: Path, types: dict[str, dict[str, list[Any]]]) -> dict[str, Path]: + """ + Writes import specifications to 1 or more tsv files. All the writers in this module + have the same function signatures; see the module level documentation. + """ + return _write_xsv(folder, types, _EXT_TSV, _SEP_TSV) + + +def _write_xsv(folder: Path, types: dict[str, dict[str, list[Any]]], ext: str, sep: str): + _check_write_args(folder, types) + res = {} + for datatype in types: + out = folder / (datatype + "." + ext) + dt = types[datatype] + cols = len(dt[_ORDER_AND_DISPLAY]) + with open(out, "w", newline='') as f: + csvw = csv.writer(f, delimiter=sep) # handle sep escaping + csvw.writerow([f"{_DATA_TYPE} {datatype}{_HEADER_SEP} " + + f"{_COLUMN_STR} {cols}{_HEADER_SEP} {_VERSION_STR} {_VERSION}"]) + pids = [i[0] for i in dt[_ORDER_AND_DISPLAY]] + csvw.writerow(pids) + csvw.writerow([i[1] for i in dt[_ORDER_AND_DISPLAY]]) + for row in dt[_DATA]: + csvw.writerow([row[pid] for pid in pids]) + res[datatype] = out + return res + + +def _check_write_args(folder: Path, types: dict[str, dict[str, list[Any]]]): + if not folder: + # this is a programming error, not a user input error, so not using the custom + # exception here + raise ValueError("The folder cannot be null") + if type(types) != dict: + raise ImportSpecWriteException("The types value must be a mapping") + _check_import_specification(types) + + +class ImportSpecWriteException(Exception): + """ + An exception thrown when writing an import specification fails. + """ + pass \ No newline at end of file diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py new file mode 100644 index 0000000..8021e4a --- /dev/null +++ b/tests/import_specifications/test_file_writers.py @@ -0,0 +1,230 @@ +import uuid + +from collections.abc import Generator +from pathlib import Path +from pytest import raises, fixture +from tests.test_app import FileUtil + +from staging_service.import_specifications.file_writers import ( + write_csv, + write_tsv, + ImportSpecWriteException, +) + +from tests.test_utils import assert_exception_correct + + +@fixture(scope="module") +def temp_dir() -> Generator[Path, None, None]: + with FileUtil() as fu: + childdir = Path(fu.make_dir(str(uuid.uuid4()))).resolve() + + yield childdir + + # FileUtil will auto delete after exiting + + +_TEST_DATA = { + "type1": { + "order_and_display": [ + ("id1", "thing,with,comma"), + ("id2", "other"), + ("id3", "thing\twith\ttabs"), + ], + "data": [ + {"id3": "comma,comma", "id1": "yay!\ttab", "id2": 42}, + {"id3": 56.78, "id1": "boo!", "id2": None}, + ] + }, + "type2": { + "order_and_display": [ + ("id1", "oh no I only have one column"), + ], + "data": [ + {"id1": "foo"}, + {"id1": 0}, + ] + }, + "type3": { + "order_and_display": [ + ("some_id", "hey this"), + ("tab\tid", "xsv is only"), + ("comma,id", "a template"), + ], + "data": [] + } +} + +def test_noop(): + assert write_csv(Path("."), {}) == {} + assert write_tsv(Path("."), {}) == {} + +def test_write_csv(temp_dir: Path): + res = write_csv(temp_dir, _TEST_DATA) + assert res == { + "type1": temp_dir / "type1.csv", + "type2": temp_dir / "type2.csv", + "type3": temp_dir / "type3.csv", + } + _check_contents( + temp_dir / "type1.csv", + [ + "Data type: type1; Columns: 3; Version: 1\n", + "id1,id2,id3\n", + '"thing,with,comma",other,thing\twith\ttabs\n', + 'yay!\ttab,42,"comma,comma"\n', + "boo!,,56.78\n", + ] + ) + 
_check_contents( + temp_dir / "type2.csv", + [ + "Data type: type2; Columns: 1; Version: 1\n", + "id1\n", + "oh no I only have one column\n", + "foo\n", + "0\n", + ] + ) + _check_contents( + temp_dir / "type3.csv", + [ + "Data type: type3; Columns: 3; Version: 1\n", + 'some_id,tab\tid,"comma,id"\n', + "hey this,xsv is only,a template\n", + ] + ) + + +def test_write_tsv(temp_dir: Path): + res = write_tsv(temp_dir, _TEST_DATA) + assert res == { + "type1": temp_dir / "type1.tsv", + "type2": temp_dir / "type2.tsv", + "type3": temp_dir / "type3.tsv", + } + _check_contents( + temp_dir / "type1.tsv", + [ + "Data type: type1; Columns: 3; Version: 1\n", + "id1\tid2\tid3\n", + 'thing,with,comma\tother\t"thing\twith\ttabs"\n', + '"yay!\ttab"\t42\tcomma,comma\n', + "boo!\t\t56.78\n", + ] + ) + _check_contents( + temp_dir / "type2.tsv", + [ + "Data type: type2; Columns: 1; Version: 1\n", + "id1\n", + "oh no I only have one column\n", + "foo\n", + "0\n", + ] + ) + _check_contents( + temp_dir / "type3.tsv", + [ + "Data type: type3; Columns: 3; Version: 1\n", + 'some_id\t"tab\tid"\tcomma,id\n', + "hey this\txsv is only\ta template\n", + ] + ) + + +def _check_contents(file: Path, lines: list[str]): + with open(file) as f: + assert f.readlines() == lines + + +def test_file_writers_fail(): + p = Path() + E = ImportSpecWriteException + file_writers_fail(None, {}, ValueError("The folder cannot be null")) + file_writers_fail(p, None, E("The types value must be a mapping")) + file_writers_fail( + p, {None: 1}, E("A data type cannot be a non-string or a whitespace only string")) + file_writers_fail( + p, + {" \t ": 1}, + E("A data type cannot be a non-string or a whitespace only string")) + file_writers_fail(p, {"t": []}, E("The value for data type t must be a mapping")) + file_writers_fail(p, {"t": 1}, E("The value for data type t must be a mapping")) + file_writers_fail(p, {"t": {}}, E("Data type t missing order_and_display key")) + file_writers_fail( + p, + {"t": {"order_and_display": {}, "data": []}}, + E("Data type t order_and_display value is not a list")) + file_writers_fail( + p, + {"t": {"order_and_display": [], "data": []}}, + E("At least one entry is required for order_and_display for type t")) + file_writers_fail( + p, + {"t": {"order_and_display": [["foo", "bar"]]}}, + E("Data type t missing data key")) + file_writers_fail( + p, + {"t": {"order_and_display": [["foo", "bar"]], "data": "foo"}}, + E("Data type t data value is not a list")) + file_writers_fail( + p, + {"t": {"order_and_display": [["foo", "bar"], "baz"] , "data": []}}, + E("Invalid order_and_display entry for datatype t at index 1 - " + + "the entry is not a list")) + file_writers_fail( + p, + {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"], ["baz"]] , "data": []}}, + E("Invalid order_and_display entry for datatype t at index 2 - " + + "expected 2 item list")) + file_writers_fail( + p, + {"t": {"order_and_display": [("foo", "bar", "whee"), ["whee", "whoo"]] , "data": []}}, + E("Invalid order_and_display entry for datatype t at index 0 - " + + "expected 2 item list")) + for parm in [None, " \t ", 1]: + file_writers_fail( + p, + {"t": {"order_and_display": [(parm, "foo"), ["whee", "whoo"]], "data": []}}, + E("Invalid order_and_display entry for datatype t at index 0 - " + + "parameter ID cannot be a non-string or a whitespace only string")) + file_writers_fail( + p, + {"t": {"order_and_display": [("bar", "foo"), ["whee", parm]], "data": []}}, + E("Invalid order_and_display entry for datatype t at index 1 - " + + "parameter 
+
+
+def test_file_writers_fail():
+    p = Path()
+    E = ImportSpecWriteException
+    file_writers_fail(None, {}, ValueError("The folder cannot be null"))
+    file_writers_fail(p, None, E("The types value must be a mapping"))
+    file_writers_fail(
+        p, {None: 1}, E("A data type cannot be a non-string or a whitespace only string"))
+    file_writers_fail(
+        p,
+        {" \t ": 1},
+        E("A data type cannot be a non-string or a whitespace only string"))
+    file_writers_fail(p, {"t": []}, E("The value for data type t must be a mapping"))
+    file_writers_fail(p, {"t": 1}, E("The value for data type t must be a mapping"))
+    file_writers_fail(p, {"t": {}}, E("Data type t missing order_and_display key"))
+    file_writers_fail(
+        p,
+        {"t": {"order_and_display": {}, "data": []}},
+        E("Data type t order_and_display value is not a list"))
+    file_writers_fail(
+        p,
+        {"t": {"order_and_display": [], "data": []}},
+        E("At least one entry is required for order_and_display for type t"))
+    file_writers_fail(
+        p,
+        {"t": {"order_and_display": [["foo", "bar"]]}},
+        E("Data type t missing data key"))
+    file_writers_fail(
+        p,
+        {"t": {"order_and_display": [["foo", "bar"]], "data": "foo"}},
+        E("Data type t data value is not a list"))
+    file_writers_fail(
+        p,
+        {"t": {"order_and_display": [["foo", "bar"], "baz"], "data": []}},
+        E("Invalid order_and_display entry for datatype t at index 1 - "
+          + "the entry is not a list"))
+    file_writers_fail(
+        p,
+        {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"], ["baz"]], "data": []}},
+        E("Invalid order_and_display entry for datatype t at index 2 - "
+          + "expected 2 item list"))
+    file_writers_fail(
+        p,
+        {"t": {"order_and_display": [("foo", "bar", "whee"), ["whee", "whoo"]], "data": []}},
+        E("Invalid order_and_display entry for datatype t at index 0 - "
+          + "expected 2 item list"))
+    for parm in [None, " \t ", 1]:
+        file_writers_fail(
+            p,
+            {"t": {"order_and_display": [(parm, "foo"), ["whee", "whoo"]], "data": []}},
+            E("Invalid order_and_display entry for datatype t at index 0 - "
+              + "parameter ID cannot be a non-string or a whitespace only string"))
+        file_writers_fail(
+            p,
+            {"t": {"order_and_display": [("bar", "foo"), ["whee", parm]], "data": []}},
+            E("Invalid order_and_display entry for datatype t at index 1 - "
+              + "parameter display name cannot be a non-string or a whitespace only string"))
+    file_writers_fail(
+        p,
+        {"t": {"order_and_display": [("bar", "foo"), ["whee", "whoo"]], "data": ["foo"]}},
+        E("Data type t data row 0 is not a mapping"))
+    file_writers_fail(
+        p,
+        {"t": {"order_and_display": [("bar", "foo")], "data": [{"bar": 1}, []]}},
+        E("Data type t data row 1 is not a mapping"))
+    file_writers_fail(
+        p,
+        {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]],
+               "data": [{"foo": 1, "whee": 2}, {"foo": 2}]}},
+        E("Data type t data row 1 does not have the same keys as order_and_display"))
+    file_writers_fail(
+        p,
+        {"ty": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]],
+                "data": [{"foo": 2, "whee": 3, 5: 4}, {"foo": 1, "whee": 2}]}},
+        E("Data type ty data row 0 does not have the same keys as order_and_display"))
+    file_writers_fail(
+        p,
+        {"ty": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]],
+                "data": [{"foo": 2, "whee": 3}, {"foo": 1, "whee": []}]}},
+        E("Data type ty data row 1's value for parameter whee "
+          + "is not a number or a string"))
+
+
+def file_writers_fail(path: Path, types: dict, expected: Exception):
+    with raises(Exception) as got:
+        write_csv(path, types)
+    assert_exception_correct(got.value, expected)
+    with raises(Exception) as got:
+        write_tsv(path, types)
+    assert_exception_correct(got.value, expected)
\ No newline at end of file
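Rounding out the picture, the empty-`data` template case that `type3` exercises above, as an end-to-end sketch (the datatype and parameter ID are hypothetical):

```python
import tempfile
from pathlib import Path

from staging_service.import_specifications.file_writers import write_tsv

# An empty data list yields just the three header rows - a fillable template.
types = {"reads": {"order_and_display": [["fwd", "Forward file"]], "data": []}}
with tempfile.TemporaryDirectory() as d:
    out = write_tsv(Path(d), types)["reads"]
    print(out.read_text(), end="")
    # Data type: reads; Columns: 1; Version: 1
    # fwd
    # Forward file
```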