From b12fe74a9a46c92302eba221766d0db85e69317e Mon Sep 17 00:00:00 2001 From: David Raznick Date: Thu, 28 Jan 2021 01:32:34 +0000 Subject: [PATCH 1/2] Flattening: Reduce memory Footprint. * Use ijson * Use openpyxl write_only mode * Store sheet lines in an embedded btree ZODB index https://github.com/OpenDataServices/flatten-tool/issues/316 --- .github/workflows/test.yml | 2 +- CHANGELOG.md | 1 + flattentool/__init__.py | 50 ++++---- flattentool/json_input.py | 108 ++++++++++++++---- flattentool/output.py | 3 +- flattentool/sheet.py | 47 +++++++- flattentool/tests/test_json_input.py | 100 +++++++--------- .../test_json_input_is_unflatten_reversed.py | 8 +- flattentool/tests/test_output.py | 14 ++- flattentool/tests/test_xml_input.py | 34 +++--- setup.py | 3 + 11 files changed, 229 insertions(+), 141 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 359a24eb..29fa04b0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,4 +24,4 @@ jobs: - run: py.test --cov . - env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: coveralls + run: coveralls --service=github diff --git a/CHANGELOG.md b/CHANGELOG.md index ce5eb884..b916ee1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Fixed +- flattening: Uses much less memory by storing data in an embedded ZODB database, using ijson and using write-only mode in openpyxl. 
- use-titles: Use $ref'erring title if available https://github.com/OpenDataServices/flatten-tool/pull/368 - create-template --no-deprecated-fields: Did not work if deprecated element at same level as a $ref https://github.com/OpenDataServices/flatten-tool/issues/185#issuecomment-719587348 diff --git a/flattentool/__init__.py b/flattentool/__init__.py index 5c4f4bbf..b700353a 100644 --- a/flattentool/__init__.py +++ b/flattentool/__init__.py @@ -112,7 +112,7 @@ def flatten( else: schema_parser = None - parser = JSONParser( + with JSONParser( json_filename=input_name, root_list_path=None if root_is_list else root_list_path, schema_parser=schema_parser, @@ -126,33 +126,33 @@ def flatten( preserve_fields=preserve_fields, remove_empty_schema_columns=remove_empty_schema_columns, truncation_length=truncation_length, - ) - parser.parse() - - def spreadsheet_output(spreadsheet_output_class, name): - spreadsheet_output = spreadsheet_output_class( - parser=parser, - main_sheet_name=main_sheet_name, - output_name=name, - sheet_prefix=sheet_prefix, - ) - spreadsheet_output.write_sheets() - - if output_format == "all": - if not output_name: - output_name = "flattened" - for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items(): - spreadsheet_output( - spreadsheet_output_class, output_name + FORMATS_SUFFIX[format_name] + persist=True, + ) as parser: + + def spreadsheet_output(spreadsheet_output_class, name): + spreadsheet_output = spreadsheet_output_class( + parser=parser, + main_sheet_name=main_sheet_name, + output_name=name, + sheet_prefix=sheet_prefix, ) + spreadsheet_output.write_sheets() + + if output_format == "all": + if not output_name: + output_name = "flattened" + for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items(): + spreadsheet_output( + spreadsheet_output_class, output_name + FORMATS_SUFFIX[format_name] + ) - elif output_format in OUTPUT_FORMATS.keys(): # in dictionary of allowed formats - if not output_name: - output_name = "flattened" + 
FORMATS_SUFFIX[output_format] - spreadsheet_output(OUTPUT_FORMATS[output_format], output_name) + elif output_format in OUTPUT_FORMATS.keys(): # in dictionary of allowed formats + if not output_name: + output_name = "flattened" + FORMATS_SUFFIX[output_format] + spreadsheet_output(OUTPUT_FORMATS[output_format], output_name) - else: - raise Exception("The requested format is not available") + else: + raise Exception("The requested format is not available") # From http://bugs.python.org/issue16535 diff --git a/flattentool/json_input.py b/flattentool/json_input.py index fa9634d8..79567c0e 100644 --- a/flattentool/json_input.py +++ b/flattentool/json_input.py @@ -7,18 +7,24 @@ import codecs import copy -import json import os +import tempfile +import uuid from collections import OrderedDict from decimal import Decimal from warnings import warn +import BTrees.OOBTree +import ijson +import transaction import xmltodict +import zc.zlibstorage +import ZODB.FileStorage from flattentool.i18n import _ from flattentool.input import path_search from flattentool.schema import make_sub_sheet_name -from flattentool.sheet import Sheet +from flattentool.sheet import PersistentSheet BASIC_TYPES = [str, bool, int, Decimal, type(None)] @@ -112,9 +118,26 @@ def __init__( remove_empty_schema_columns=False, rollup=False, truncation_length=3, + persist=False, ): + if persist: + self.zodb_db_location = ( + tempfile.gettempdir() + "/flattentool-" + str(uuid.uuid4()) + ) + zodb_storage = zc.zlibstorage.ZlibStorage( + ZODB.FileStorage.FileStorage(self.zodb_db_location) + ) + self.db = ZODB.DB(zodb_storage) + else: + # If None, in memory storage is used. 
+ self.db = ZODB.DB(None) + + self.connection = self.db.open() + root = self.connection.root + root.sheet_store = BTrees.OOBTree.BTree() + self.sub_sheets = {} - self.main_sheet = Sheet() + self.main_sheet = PersistentSheet(connection=self.connection, name="") self.root_list_path = root_list_path self.root_id = root_id self.use_titles = use_titles @@ -125,9 +148,17 @@ def __init__( self.filter_value = filter_value self.remove_empty_schema_columns = remove_empty_schema_columns self.seen_paths = set() + self.persist = persist if schema_parser: - self.main_sheet = copy.deepcopy(schema_parser.main_sheet) + self.main_sheet = PersistentSheet.from_sheet( + schema_parser.main_sheet, self.connection + ) + for sheet_name, sheet in list(self.sub_sheets.items()): + self.sub_sheets[sheet_name] = PersistentSheet.from_sheet( + sheet, self.connection + ) + self.sub_sheets = copy.deepcopy(schema_parser.sub_sheets) if remove_empty_schema_columns: # Don't use columns from the schema parser @@ -194,18 +225,13 @@ def __init__( _("Only one of json_file or root_json_dict should be supplied") ) - if json_filename: - with codecs.open(json_filename, encoding="utf-8") as json_file: - try: - self.root_json_dict = json.load( - json_file, object_pairs_hook=OrderedDict, parse_float=Decimal - ) - except UnicodeError as err: - raise BadlyFormedJSONErrorUTF8(*err.args) - except ValueError as err: - raise BadlyFormedJSONError(*err.args) - else: - self.root_json_dict = root_json_dict + if not json_filename: + if self.root_list_path is None: + self.root_json_list = root_json_dict + else: + self.root_json_list = path_search( + root_json_dict, self.root_list_path.split("/") + ) if preserve_fields: # Extract fields to be preserved from input file (one path per line) @@ -240,19 +266,37 @@ def __init__( self.preserve_fields = None self.preserve_fields_input = None + if json_filename: + if self.root_list_path is None: + path = "item" + else: + path = root_list_path.replace("/", ".") + ".item" + + json_file 
= codecs.open(json_filename, encoding="utf-8") + + self.root_json_list = ijson.items(json_file, path, map_type=OrderedDict) + + try: + self.parse() + except ijson.common.IncompleteJSONError as err: + raise BadlyFormedJSONError(*err.args) + except UnicodeDecodeError as err: + raise BadlyFormedJSONErrorUTF8(*err.args) + finally: + if json_filename: + json_file.close() + def parse(self): - if self.root_list_path is None: - root_json_list = self.root_json_dict - else: - root_json_list = path_search( - self.root_json_dict, self.root_list_path.split("/") - ) - for json_dict in root_json_list: + for num, json_dict in enumerate(self.root_json_list): if json_dict is None: # This is particularly useful for IATI XML, in order to not # fall over on empty activity, e.g. continue self.parse_json_dict(json_dict, sheet=self.main_sheet) + if num % 2000 == 0 and num != 0: + transaction.commit() + + transaction.commit() if self.remove_empty_schema_columns: # Remove sheets with no lines of data @@ -501,7 +545,9 @@ def parse_json_dict( parent_name, key, truncation_length=self.truncation_length ) if sub_sheet_name not in self.sub_sheets: - self.sub_sheets[sub_sheet_name] = Sheet(name=sub_sheet_name) + self.sub_sheets[sub_sheet_name] = PersistentSheet( + name=sub_sheet_name, connection=self.connection + ) for json_dict in value: if json_dict is None: @@ -518,4 +564,16 @@ def parse_json_dict( raise ValueError(_("Unsupported type {}").format(type(value))) if top: - sheet.lines.append(flattened_dict) + sheet.append_line(flattened_dict) + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + if self.persist: + self.connection.close() + self.db.close() + os.remove(self.zodb_db_location) + os.remove(self.zodb_db_location + ".lock") + os.remove(self.zodb_db_location + ".index") + os.remove(self.zodb_db_location + ".tmp") diff --git a/flattentool/output.py b/flattentool/output.py index b92b0d02..947ceac6 100644 --- a/flattentool/output.py +++ 
b/flattentool/output.py @@ -50,7 +50,7 @@ def close(self): class XLSXOutput(SpreadsheetOutput): def open(self): - self.workbook = openpyxl.Workbook() + self.workbook = openpyxl.Workbook(write_only=True) def write_sheet(self, sheet_name, sheet): sheet_header = list(sheet) @@ -75,7 +75,6 @@ def write_sheet(self, sheet_name, sheet): worksheet.append(line) def close(self): - self.workbook.remove(self.workbook.active) self.workbook.save(self.output_name) diff --git a/flattentool/sheet.py b/flattentool/sheet.py index 05f2159a..df6b99be 100644 --- a/flattentool/sheet.py +++ b/flattentool/sheet.py @@ -1,3 +1,8 @@ +import copy + +import BTrees.IOBTree + + class Sheet(object): """ An abstract representation of a single sheet of a spreadsheet. @@ -8,10 +13,14 @@ def __init__(self, columns=None, root_id="", name=None): self.id_columns = [] self.columns = columns if columns else [] self.titles = {} - self.lines = [] + self._lines = [] self.root_id = root_id self.name = name + @property + def lines(self): + return self._lines + def add_field(self, field, id_field=False): columns = self.id_columns if id_field else self.columns if field not in columns: @@ -27,3 +36,39 @@ def __iter__(self): yield column for column in self.columns: yield column + + def append_line(self, flattened_dict): + self._lines.append(flattened_dict) + + +class PersistentSheet(Sheet): + """ + A sheet that is persisted in ZODB database. 
+ + """ + + def __init__(self, columns=None, root_id="", name=None, connection=None): + super().__init__(columns=columns, root_id=root_id, name=name) + self.connection = connection + self.index = 0 + connection.root.sheet_store[self.name] = BTrees.IOBTree.BTree() + + @property + def lines(self): + for key, value in self.connection.root.sheet_store[self.name].items(): + if key % 5000 == 0: + self.connection.cacheMinimize() + yield value + + def append_line(self, flattened_dict): + self.connection.root.sheet_store[self.name][self.index] = flattened_dict + self.index += 1 + + @classmethod + def from_sheet(cls, sheet, connection): + instance = cls(name=sheet.name, connection=connection) + instance.id_columns = copy.deepcopy(sheet.id_columns) + instance.columns = copy.deepcopy(sheet.columns) + instance.titles = copy.deepcopy(sheet.titles) + instance.root_id = sheet.root_id + return instance diff --git a/flattentool/tests/test_json_input.py b/flattentool/tests/test_json_input.py index 738d36bd..35357863 100644 --- a/flattentool/tests/test_json_input.py +++ b/flattentool/tests/test_json_input.py @@ -59,30 +59,29 @@ def test_jsonparser_arguments_exceptions(tmpdir): def test_json_filename(tmpdir): test_json = tmpdir.join("test.json") - test_json.write('{"a":"b"}') + test_json.write('[{"a":"b"}]') parser = JSONParser(json_filename=test_json.strpath) - assert parser.root_json_dict == {"a": "b"} + assert list(parser.main_sheet.lines) == [{"a": "b"}] def test_json_filename_utf8(tmpdir): test_json = tmpdir.join("test.json") - test_json.write_text('{"a":"éαГ😼𝒞人"}', encoding="utf-8") + test_json.write_text('[{"a":"éαГ😼𝒞人"}]', encoding="utf-8") parser = JSONParser(json_filename=test_json.strpath) - assert parser.root_json_dict == {"a": "éαГ😼𝒞人"} + assert list(parser.main_sheet.lines) == [{"a": "éαГ😼𝒞人"}] def test_json_filename_ordered(tmpdir): test_json = tmpdir.join("test.json") - test_json.write('{"a":"b", "c": "d"}') + test_json.write('[{"a":"b", "c": "d"}]') parser = 
JSONParser(json_filename=test_json.strpath) - assert list(parser.root_json_dict.items()) == [("a", "b"), ("c", "d")] + assert list(parser.main_sheet.lines) == [{"a": "b", "c": "d"}] def test_parse_empty_json_dict(): parser = JSONParser(root_json_dict={}) - parser.parse() assert list(parser.main_sheet) == [] - assert parser.main_sheet.lines == [] + assert list(parser.main_sheet.lines) == [] assert parser.sub_sheets == {} @@ -93,9 +92,8 @@ def test_parse_basic_json_dict(): OrderedDict([("a", "e"), ("c", "f"),]), ] ) - parser.parse() assert list(parser.main_sheet) == ["a", "c"] - assert parser.main_sheet.lines == [ + assert list(parser.main_sheet.lines) == [ {"a": "b", "c": "d"}, {"a": "e", "c": "f"}, ] @@ -106,9 +104,8 @@ def test_parse_nested_dict_json_dict(): parser = JSONParser( root_json_dict=[OrderedDict([("a", "b"), ("c", OrderedDict([("d", "e")])),])] ) - parser.parse() assert list(parser.main_sheet) == ["a", "c/d"] - assert parser.main_sheet.lines == [{"a": "b", "c/d": "e"}] + assert list(parser.main_sheet.lines) == [{"a": "b", "c/d": "e"}] assert parser.sub_sheets == {} @@ -116,9 +113,8 @@ def test_parse_nested_list_json_dict(): parser = JSONParser( root_json_dict=[OrderedDict([("a", "b"), ("c", [OrderedDict([("d", "e")])]),])] ) - parser.parse() assert list(parser.main_sheet) == ["a"] - assert parser.main_sheet.lines == [{"a": "b"}] + assert list(parser.main_sheet.lines) == [{"a": "b"}] listify(parser.sub_sheets) == {"c": ["d"]} parser.sub_sheets["c"].lines == [{"d": "e"}] @@ -127,9 +123,8 @@ def test_parse_array(): parser = JSONParser( root_json_dict=[OrderedDict([("testarray", ["item", "anotheritem", 42])])] ) - parser.parse() assert list(parser.main_sheet) == ["testarray"] - assert parser.main_sheet.lines == [{"testarray": "item;anotheritem;42"}] + assert list(parser.main_sheet.lines) == [{"testarray": "item;anotheritem;42"}] assert parser.sub_sheets == {} @@ -138,9 +133,8 @@ def test_root_list_path(): root_json_dict={"custom_key": [OrderedDict([("a", 
"b"), ("c", "d"),])]}, root_list_path="custom_key", ) - parser.parse() assert list(parser.main_sheet) == ["a", "c"] - assert parser.main_sheet.lines == [{"a": "b", "c": "d"}] + assert list(parser.main_sheet.lines) == [{"a": "b", "c": "d"}] assert parser.sub_sheets == {} @@ -169,11 +163,12 @@ def test_parse_ids(self): ], root_id="ocid", ) - parser.parse() assert list(parser.main_sheet) == ["ocid", "id", "a", "f/g"] - assert parser.main_sheet.lines == [{"ocid": 1, "id": 2, "a": "b", "f/g": "h"}] + assert list(parser.main_sheet.lines) == [ + {"ocid": 1, "id": 2, "a": "b", "f/g": "h"} + ] listify(parser.sub_sheets) == {"c": ["ocid", "id", "c/0/id", "c/0/d"]} - assert parser.sub_sheets["c"].lines == [ + assert list(parser.sub_sheets["c"].lines) == [ {"ocid": 1, "id": 2, "c/0/id": 3, "c/0/d": "e"}, {"ocid": 1, "id": 2, "c/0/id": 3, "c/0/d": "e2"}, ] @@ -212,9 +207,8 @@ def test_parse_ids_subsheet(self): ], root_id="ocid", ) - parser.parse() assert list(parser.main_sheet) == ["ocid", "id"] - assert parser.main_sheet.lines == [{"ocid": 1, "id": 2,}] + assert list(parser.main_sheet.lines) == [{"ocid": 1, "id": 2,}] assert listify(parser.sub_sheets) == { "testnest": [ "ocid", @@ -225,7 +219,7 @@ def test_parse_ids_subsheet(self): ], "tes_c": ["ocid", "id", "testnest/0/id", "testnest/0/c/0/d"], } - assert parser.sub_sheets["testnest"].lines == [ + assert list(parser.sub_sheets["testnest"].lines) == [ { "ocid": 1, "id": 2, @@ -234,7 +228,7 @@ def test_parse_ids_subsheet(self): "testnest/0/f/g": "h", }, ] - assert parser.sub_sheets["tes_c"].lines == [ + assert list(parser.sub_sheets["tes_c"].lines) == [ {"ocid": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e"}, {"ocid": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e2"}, ] @@ -271,15 +265,14 @@ def test_parse_ids_nested(self): ], root_id="ocid", ) - parser.parse() assert list(parser.main_sheet) == ["ocid", "id", "a", "testnest/id", "f/g"] - assert parser.main_sheet.lines == [ + assert 
list(parser.main_sheet.lines) == [ {"ocid": 1, "id": 2, "a": "b", "testnest/id": 3, "f/g": "h"} ] assert listify(parser.sub_sheets) == { "tes_c": ["ocid", "id", "testnest/id", "testnest/c/0/d"] } - assert parser.sub_sheets["tes_c"].lines == [ + assert list(parser.sub_sheets["tes_c"].lines) == [ {"ocid": 1, "id": 2, "testnest/id": 3, "testnest/c/0/d": "e"}, {"ocid": 1, "id": 2, "testnest/id": 3, "testnest/c/0/d": "e2"}, ] @@ -326,9 +319,8 @@ def test_sub_sheets(self, tmpdir, remove_empty_schema_columns): schema_parser=schema_parser, remove_empty_schema_columns=remove_empty_schema_columns, ) - parser.parse() assert list(parser.main_sheet) == ["a"] - assert parser.main_sheet.lines == [{"a": "b"}] + assert list(parser.main_sheet.lines) == [{"a": "b"}] assert len(parser.sub_sheets) == 2 if not remove_empty_schema_columns else 1 if not remove_empty_schema_columns: assert list(parser.sub_sheets["c"]) == list(["ocid", "c/0/d", "c/0/f"]) @@ -352,11 +344,10 @@ def test_column_matching(self, tmpdir): schema_parser = SchemaParser(schema_filename=test_schema.strpath) schema_parser.parse() parser = JSONParser( - root_json_dict=[OrderedDict([("c", ["d"]),])], schema_parser=schema_parser + root_json_dict=[OrderedDict([("c", ["d"]),])], schema_parser=schema_parser, ) - parser.parse() assert list(parser.main_sheet) == ["c"] - assert parser.main_sheet.lines == [{"c": "d"}] + assert list(parser.main_sheet.lines) == [{"c": "d"}] assert len(parser.sub_sheets) == 0 def test_rollup(self): @@ -390,9 +381,8 @@ def test_rollup(self): root_id="ocid", rollup=True, ) - parser.parse() assert list(parser.main_sheet) == ["testA/0/testB"] - assert parser.main_sheet.lines == [{"testA/0/testB": "1"}] + assert list(parser.main_sheet.lines) == [{"testA/0/testB": "1"}] assert len(parser.sub_sheets) == 1 assert set(parser.sub_sheets["testA"]) == set( ["ocid", "testA/0/testB", "testA/0/testC"] @@ -438,9 +428,8 @@ def test_rollup_multiple_values(self, recwarn): schema_parser=schema_parser, rollup=True, ) - 
parser.parse() assert list(parser.main_sheet) == ["testA/0/testB"] - assert parser.main_sheet.lines == [ + assert list(parser.main_sheet.lines) == [ { "testA/0/testB": "WARNING: More than one value supplied, consult the relevant sub-sheet for the data." } @@ -502,7 +491,6 @@ def test_two_parents(self): ], schema_parser=schema_parser, ) - parser.parse() assert set(parser.main_sheet) == set() assert set(parser.sub_sheets) == set( ["Atest", "Dtest", "Ate_Btest", "Dte_Btest"] @@ -547,11 +535,12 @@ def test_parse_ids(self): ], root_id="custom", ) - parser.parse() assert list(parser.main_sheet) == ["custom", "id", "a", "f/g"] - assert parser.main_sheet.lines == [{"custom": 1, "id": 2, "a": "b", "f/g": "h"}] + assert list(parser.main_sheet.lines) == [ + {"custom": 1, "id": 2, "a": "b", "f/g": "h"} + ] assert listify(parser.sub_sheets) == {"c": ["custom", "id", "c/0/id", "c/0/d"]} - assert parser.sub_sheets["c"].lines == [ + assert list(parser.sub_sheets["c"].lines) == [ {"custom": 1, "id": 2, "c/0/id": 3, "c/0/d": "e"}, {"custom": 1, "id": 2, "c/0/id": 3, "c/0/d": "e2"}, ] @@ -590,9 +579,8 @@ def test_parse_ids_subsheet(self): ], root_id="custom", ) - parser.parse() assert list(parser.main_sheet) == ["custom", "id"] - assert parser.main_sheet.lines == [{"custom": 1, "id": 2,}] + assert list(parser.main_sheet.lines) == [{"custom": 1, "id": 2,}] assert listify(parser.sub_sheets) == { "testnest": [ "custom", @@ -603,7 +591,7 @@ def test_parse_ids_subsheet(self): ], "tes_c": ["custom", "id", "testnest/0/id", "testnest/0/c/0/d"], } - assert parser.sub_sheets["testnest"].lines == [ + assert list(parser.sub_sheets["testnest"].lines) == [ { "custom": 1, "id": 2, @@ -612,7 +600,7 @@ def test_parse_ids_subsheet(self): "testnest/0/f/g": "h", }, ] - assert parser.sub_sheets["tes_c"].lines == [ + assert list(parser.sub_sheets["tes_c"].lines) == [ {"custom": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e"}, {"custom": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e2"}, ] 
@@ -649,15 +637,14 @@ def test_parse_ids_nested(self): ], root_id="custom", ) - parser.parse() assert list(parser.main_sheet) == ["custom", "id", "a", "testnest/id", "f/g"] - assert parser.main_sheet.lines == [ + assert list(parser.main_sheet.lines) == [ {"custom": 1, "id": 2, "a": "b", "testnest/id": 3, "f/g": "h"} ] assert listify(parser.sub_sheets) == { "tes_c": ["custom", "id", "testnest/id", "testnest/c/0/d"] } - assert parser.sub_sheets["tes_c"].lines == [ + assert list(parser.sub_sheets["tes_c"].lines) == [ {"custom": 1, "id": 2, "testnest/id": 3, "testnest/c/0/d": "e"}, {"custom": 1, "id": 2, "testnest/id": 3, "testnest/c/0/d": "e2"}, ] @@ -687,11 +674,10 @@ def test_parse_ids(self): ], root_id="", ) - parser.parse() assert list(parser.main_sheet) == ["id", "a", "f/g"] - assert parser.main_sheet.lines == [{"id": 2, "a": "b", "f/g": "h"}] + assert list(parser.main_sheet.lines) == [{"id": 2, "a": "b", "f/g": "h"}] assert listify(parser.sub_sheets) == {"c": ["id", "c/0/id", "c/0/d"]} - assert parser.sub_sheets["c"].lines == [ + assert list(parser.sub_sheets["c"].lines) == [ {"id": 2, "c/0/id": 3, "c/0/d": "e"}, {"id": 2, "c/0/id": 3, "c/0/d": "e2"}, ] @@ -729,17 +715,16 @@ def test_parse_ids_subsheet(self): ], root_id="", ) - parser.parse() assert list(parser.main_sheet) == ["id"] - assert parser.main_sheet.lines == [{"id": 2,}] + assert list(parser.main_sheet.lines) == [{"id": 2,}] assert listify(parser.sub_sheets) == { "testnest": ["id", "testnest/0/id", "testnest/0/a", "testnest/0/f/g"], "tes_c": ["id", "testnest/0/id", "testnest/0/c/0/d"], } - assert parser.sub_sheets["testnest"].lines == [ + assert list(parser.sub_sheets["testnest"].lines) == [ {"id": 2, "testnest/0/id": 3, "testnest/0/a": "b", "testnest/0/f/g": "h",}, ] - assert parser.sub_sheets["tes_c"].lines == [ + assert list(parser.sub_sheets["tes_c"].lines) == [ {"id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e"}, {"id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e2"}, ] @@ -775,15 +760,14 @@ 
def test_parse_ids_nested(self): ], root_id="", ) - parser.parse() assert list(parser.main_sheet) == ["id", "a", "testnest/id", "f/g"] - assert parser.main_sheet.lines == [ + assert list(parser.main_sheet.lines) == [ {"id": 2, "a": "b", "testnest/id": 3, "f/g": "h"} ] assert listify(parser.sub_sheets) == { "tes_c": ["id", "testnest/id", "testnest/c/0/d"] } - assert parser.sub_sheets["tes_c"].lines == [ + assert list(parser.sub_sheets["tes_c"].lines) == [ {"id": 2, "testnest/id": 3, "testnest/c/0/d": "e"}, {"id": 2, "testnest/id": 3, "testnest/c/0/d": "e2"}, ] diff --git a/flattentool/tests/test_json_input_is_unflatten_reversed.py b/flattentool/tests/test_json_input_is_unflatten_reversed.py index cdd6a9a5..3007e2e2 100644 --- a/flattentool/tests/test_json_input_is_unflatten_reversed.py +++ b/flattentool/tests/test_json_input_is_unflatten_reversed.py @@ -80,7 +80,6 @@ def test_flatten( schema_parser=schema_parser, **extra_kwargs ) - parser.parse() expected_output_list = [ inject_root_id(root_id, expected_output_dict) @@ -188,7 +187,6 @@ def test_flatten_multiplesheets( schema_parser=schema_parser, **extra_kwargs ) - parser.parse() expected_output_dict = OrderedDict( [ @@ -197,11 +195,11 @@ def test_flatten_multiplesheets( ] ) output = { - sheet_name: sheet.lines + sheet_name: list(sheet.lines) for sheet_name, sheet in parser.sub_sheets.items() - if sheet.lines + if list(sheet.lines) } - output["custom_main"] = parser.main_sheet.lines + output["custom_main"] = list(parser.main_sheet.lines) assert output == expected_output_dict diff --git a/flattentool/tests/test_output.py b/flattentool/tests/test_output.py index 023ce09b..ea47407b 100644 --- a/flattentool/tests/test_output.py +++ b/flattentool/tests/test_output.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import os +import sys import openpyxl import pytest @@ -41,7 +42,10 @@ def test_blank_sheets(tmpdir): wb = openpyxl.load_workbook(tmpdir.join("release.xlsx").strpath) assert wb.sheetnames == ["release"] 
rows = list(wb["release"].rows) - assert len(rows) == 0 + # openpyxl fixed this bug but earler versions of python are stuck with it. + # remove when we no longer support 3.5 + if sys.version_info >= (3, 6, 0): + assert len(rows) == 0 # Check CSV is Empty assert tmpdir.join("release").listdir() == [ @@ -102,7 +106,7 @@ def test_empty_lines(tmpdir): subsheet = Sheet(root_id="ocid") subsheet.add_field("c") parser = MockParser(["a", "d"], {"b": subsheet}) - parser.main_sheet.lines = [] + parser.main_sheet._lines = [] for format_name, spreadsheet_output_class in output.FORMATS.items(): spreadsheet_output = spreadsheet_output_class( parser=parser, @@ -147,8 +151,8 @@ def test_populated_lines(tmpdir): subsheet = Sheet(root_id="ocid") subsheet.add_field("c") parser = MockParser(["a"], {}) - parser.main_sheet.lines = [{"a": "cell1"}, {"a": "cell2"}] - subsheet.lines = [{"c": "cell3"}, {"c": "cell4"}] + parser.main_sheet._lines = [{"a": "cell1"}, {"a": "cell2"}] + subsheet._lines = [{"c": "cell3"}, {"c": "cell4"}] parser.sub_sheets["b"] = subsheet for format_name, spreadsheet_output_class in output.FORMATS.items(): spreadsheet_output = spreadsheet_output_class( @@ -206,7 +210,7 @@ def test_populated_lines(tmpdir): def test_utf8(tmpdir): parser = MockParser(["é"], {}) - parser.main_sheet.lines = [{"é": "éαГ😼𝒞人"}, {"é": "cell2"}] + parser.main_sheet._lines = [{"é": "éαГ😼𝒞人"}, {"é": "cell2"}] for format_name, spreadsheet_output_class in output.FORMATS.items(): spreadsheet_output = spreadsheet_output_class( parser=parser, diff --git a/flattentool/tests/test_xml_input.py b/flattentool/tests/test_xml_input.py index 4ab90784..d0539749 100644 --- a/flattentool/tests/test_xml_input.py +++ b/flattentool/tests/test_xml_input.py @@ -15,9 +15,8 @@ def test_xml_empty(): xml=True, id_name="iati-identifier", ) - parser.parse() assert list(parser.main_sheet) == [] - assert parser.main_sheet.lines == [] + assert list(parser.main_sheet.lines) == [] assert parser.sub_sheets == {} @@ -30,7 +29,6 
@@ def test_xml_basic_example(): xml=True, id_name="iati-identifier", ) - parser.parse() assert list(parser.main_sheet) == [ "iati-identifier", "reporting-org/@ref", @@ -44,7 +42,7 @@ def test_xml_basic_example(): "activity-date/@iso-date", "activity-date/@type", ] - assert parser.main_sheet.lines == [ + assert list(parser.main_sheet.lines) == [ { "activity-date/@type": "1", "reporting-org/narrative": "Organisation name", @@ -80,7 +78,7 @@ def test_xml_basic_example(): "transaction/0/value/@value-date", "transaction/0/value", ] - assert parser.sub_sheets["transaction"].lines == [ + assert list(parser.sub_sheets["transaction"].lines) == [ { "transaction/0/value/@value-date": "2012-01-01", "iati-identifier": "AA-AAA-123456789-ABC123", @@ -115,7 +113,7 @@ def test_xml_basic_example(): "recipient-country/0/@code", "recipient-country/0/@percentage", ] - assert parser.sub_sheets["recipient-country"].lines == [ + assert list(parser.sub_sheets["recipient-country"].lines) == [ { "iati-identifier": "AA-AAA-123456789-ABC123", "recipient-country/0/@code": "AF", @@ -148,9 +146,8 @@ def test_varyin_transaction_count(): xml=True, id_name="iati-identifier", ) - parser.parse() assert list(parser.main_sheet) == ["iati-identifier"] - assert parser.main_sheet.lines == [ + assert list(parser.main_sheet.lines) == [ {"iati-identifier": "AA-AAA-123456789-ABC123"}, {"iati-identifier": "AA-AAA-123456789-ABC124"}, {"iati-identifier": "AA-AAA-123456789-ABC125"}, @@ -162,7 +159,7 @@ def test_varyin_transaction_count(): "transaction/0/value/@value-date", "transaction/0/value", ] - assert parser.sub_sheets["transaction"].lines == [ + assert list(parser.sub_sheets["transaction"].lines) == [ { "iati-identifier": "AA-AAA-123456789-ABC123", "transaction/0/value/@value-date": "2012-01-01", @@ -251,16 +248,15 @@ def test_list_dict_consistency(): def test_xml_whitespace(): - parser = JSONParser( - json_filename="flattentool/tests/fixtures/narrative_whitespace.xml", - root_list_path="iati-activity", - 
 schema_parser=None, - root_id="", - xml=True, - id_name="iati-identifier", - ) - try: - parser.parse() + parser = JSONParser( + json_filename="flattentool/tests/fixtures/narrative_whitespace.xml", + root_list_path="iati-activity", + schema_parser=None, + root_id="", + xml=True, + id_name="iati-identifier", + ) + assert parser except TypeError as e: raise e diff --git a/setup.py b/setup.py index 6379e337..1202823b 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,9 @@ def run(self): "xmltodict", "lxml", "odfpy", + "zodb", + "zc.zlibstorage", + "ijson", ] setup( From 48daad257cddf3a494d5abe16b939dd6d32c1d38 Mon Sep 17 00:00:00 2001 From: David Raznick Date: Tue, 9 Feb 2021 11:12:43 +0000 Subject: [PATCH 2/2] Unflatten: Stream all unflattening. * Uses openpyxl read_only mode * Uses zodb storage to save incoming data into buckets based on id_name in top level objects. * Any object without id_name is given a random key. * Runs unflatten separately on all these buckets. * For JSON use jsonstreams to stream output into both result JSON and cell_source_map JSON. These are the only two files that are likely to be large. * For XML use lxml xmlfile to stream unflattened xmldata. 
--- examples/bods/unflatten/expected/out.json | 38 +- .../relationship-missing-ids/expected.json | 8 +- examples/iati/expected.xml | 113 ++- examples/iati_multilang/expected.xml | 117 ++- examples/iati_xml_comment/expected.xml | 117 ++- .../source-map/expected/cell_source_map.json | 674 +++++++++--------- flatten-tool | 1 + flattentool/__init__.py | 250 +++++-- flattentool/input.py | 114 ++- flattentool/tests/fixtures/iati-org.xml | 13 +- flattentool/tests/test_docs.py | 2 +- flattentool/tests/test_init.py | 12 +- flattentool/xml_output.py | 27 + setup.py | 1 + 14 files changed, 870 insertions(+), 617 deletions(-) diff --git a/examples/bods/unflatten/expected/out.json b/examples/bods/unflatten/expected/out.json index ed3c55d0..a51b1103 100644 --- a/examples/bods/unflatten/expected/out.json +++ b/examples/bods/unflatten/expected/out.json @@ -1,24 +1,24 @@ [ { - "statementID": "fbfd0547-d0c6-4a00-b559-5c5e91c34f5c", - "interests": [ - { - "type": "shareholding", - "interestLevel": "direct", - "beneficialOwnershipOrControl": true, - "startDate": "2016-04-06", - "share": { - "exact": 100 - } + "statementID": "fbfd0547-d0c6-4a00-b559-5c5e91c34f5c", + "interests": [ + { + "type": "shareholding", + "interestLevel": "direct", + "beneficialOwnershipOrControl": true, + "startDate": "2016-04-06", + "share": { + "exact": 100 } - ], - "statementType": "ownershipOrControlStatement", - "statementDate": "2017-11-18", - "subject": { - "describedByEntityStatement": "1dc0e987-5c57-4a1c-b3ad-61353b66a9b7" - }, - "interestedParty": { - "describedByPersonStatement": "019a93f1-e470-42e9-957b-03559861b2e2" } + ], + "statementType": "ownershipOrControlStatement", + "statementDate": "2017-11-18", + "subject": { + "describedByEntityStatement": "1dc0e987-5c57-4a1c-b3ad-61353b66a9b7" + }, + "interestedParty": { + "describedByPersonStatement": "019a93f1-e470-42e9-957b-03559861b2e2" } -] \ No newline at end of file +} +] diff --git a/examples/cafe/relationship-missing-ids/expected.json 
b/examples/cafe/relationship-missing-ids/expected.json index 534569ed..c7beaa11 100644 --- a/examples/cafe/relationship-missing-ids/expected.json +++ b/examples/cafe/relationship-missing-ids/expected.json @@ -16,6 +16,10 @@ } ] }, + { + "name": "Vegetarian Cafe", + "address": "42 Town Road, Bristol" + }, { "id": "CAFE-VEG", "table": [ @@ -24,10 +28,6 @@ } ] }, - { - "name": "Vegetarian Cafe", - "address": "42 Town Road, Bristol" - }, { "table": [ { diff --git a/examples/iati/expected.xml b/examples/iati/expected.xml index 3677d49a..51cd0495 100644 --- a/examples/iati/expected.xml +++ b/examples/iati/expected.xml @@ -1,58 +1,57 @@ - - - - AA-AAA-123456789-ABC123 - - Organisation name - - - <narrative>A title</narrative> - - - A description - - - - - - - - - - 10 - - - - - 20 - - - - AA-AAA-123456789-ABC124 - - Organisation name - - - <narrative>Another title</narrative> - - - Another description - - - - - - - - - - 30 - - - - - 40 - - - + + + AA-AAA-123456789-ABC123 + + Organisation name + + + <narrative>A title</narrative> + + + A description + + + + + + + + + + 10 + + + + + 20 + + + + AA-AAA-123456789-ABC124 + + Organisation name + + + <narrative>Another title</narrative> + + + Another description + + + + + + + + + + 30 + + + + + 40 + + + \ No newline at end of file diff --git a/examples/iati_multilang/expected.xml b/examples/iati_multilang/expected.xml index e0c34da0..4949f699 100644 --- a/examples/iati_multilang/expected.xml +++ b/examples/iati_multilang/expected.xml @@ -1,60 +1,59 @@ - - - - AA-AAA-123456789-ABC123 - - Organisation name - - - <narrative xml:lang="en">A title, with comma</narrative> - <narrative xml:lang="fr">Un titre</narrative> - - - A description - - - - - - - - - - 10 - - - - - 20 - - - - AA-AAA-123456789-ABC124 - - Organisation name - - - <narrative xml:lang="en">Another title; with semicolon</narrative> - <narrative xml:lang="fr">Un autre titre</narrative> - - - Another description - - - - - - - - - - 30 - - - - - 40 - - - + + + 
AA-AAA-123456789-ABC123 + + Organisation name + + + <narrative xml:lang="en">A title, with comma</narrative> + <narrative xml:lang="fr">Un titre</narrative> + + + A description + + + + + + + + + + 10 + + + + + 20 + + + + AA-AAA-123456789-ABC124 + + Organisation name + + + <narrative xml:lang="en">Another title; with semicolon</narrative> + <narrative xml:lang="fr">Un autre titre</narrative> + + + Another description + + + + + + + + + + 30 + + + + + 40 + + + \ No newline at end of file diff --git a/examples/iati_xml_comment/expected.xml b/examples/iati_xml_comment/expected.xml index 8d131cd9..bc8305eb 100644 --- a/examples/iati_xml_comment/expected.xml +++ b/examples/iati_xml_comment/expected.xml @@ -1,60 +1,59 @@ - - - - AA-AAA-123456789-ABC123 - - Organisation name - - - <narrative xml:lang="en">A title, with comma</narrative> - <narrative xml:lang="fr">Un titre</narrative> - - - A description - - - - - - - - - - 10 - - - - - 20 - - - - AA-AAA-123456789-ABC124 - - Organisation name - - - <narrative xml:lang="en">Another title; with semicolon</narrative> - <narrative xml:lang="fr">Un autre titre</narrative> - - - Another description - - - - - - - - - - 30 - - - - - 40 - - - + + + AA-AAA-123456789-ABC123 + + Organisation name + + + <narrative xml:lang="en">A title, with comma</narrative> + <narrative xml:lang="fr">Un titre</narrative> + + + A description + + + + + + + + + + 10 + + + + + 20 + + + + AA-AAA-123456789-ABC124 + + Organisation name + + + <narrative xml:lang="en">Another title; with semicolon</narrative> + <narrative xml:lang="fr">Un autre titre</narrative> + + + Another description + + + + + + + + + + 30 + + + + + 40 + + + \ No newline at end of file diff --git a/examples/receipt/source-map/expected/cell_source_map.json b/examples/receipt/source-map/expected/cell_source_map.json index e4ac98cc..fa16d4ff 100644 --- a/examples/receipt/source-map/expected/cell_source_map.json +++ b/examples/receipt/source-map/expected/cell_source_map.json @@ -1,374 +1,374 @@ 
{ "cafe/0/id": [ - [ - "1_cafes", - "A", - 2, - "id" - ], - [ - "2_tables", - "A", - 2, - "id" - ], - [ - "2_tables", - "A", - 3, - "id" - ], - [ - "2_tables", - "A", - 4, - "id" - ], - [ - "3_dishes", - "A", - 2, - "id" - ], - [ - "3_dishes", - "A", - 3, - "id" - ], - [ - "3_dishes", - "A", - 4, - "id" - ] - ], + [ + "1_cafes", + "A", + 2, + "id" + ], + [ + "2_tables", + "A", + 2, + "id" + ], + [ + "2_tables", + "A", + 3, + "id" + ], + [ + "2_tables", + "A", + 4, + "id" + ], + [ + "3_dishes", + "A", + 2, + "id" + ], + [ + "3_dishes", + "A", + 3, + "id" + ], + [ + "3_dishes", + "A", + 4, + "id" + ] +], "cafe/0/name": [ - [ - "1_cafes", - "B", - 2, - "name" - ] - ], + [ + "1_cafes", + "B", + 2, + "name" + ] +], "cafe/0/table/0/dish/0/cost": [ - [ - "3_dishes", - "D", - 2, - "table/0/dish/0/cost" - ] - ], + [ + "3_dishes", + "D", + 2, + "table/0/dish/0/cost" + ] +], "cafe/0/table/0/dish/0/name": [ - [ - "3_dishes", - "C", - 2, - "table/0/dish/0/name" - ] - ], + [ + "3_dishes", + "C", + 2, + "table/0/dish/0/name" + ] +], "cafe/0/table/0/dish/1/cost": [ - [ - "3_dishes", - "D", - 3, - "table/0/dish/0/cost" - ] - ], + [ + "3_dishes", + "D", + 3, + "table/0/dish/0/cost" + ] +], "cafe/0/table/0/dish/1/name": [ - [ - "3_dishes", - "C", - 3, - "table/0/dish/0/name" - ] - ], + [ + "3_dishes", + "C", + 3, + "table/0/dish/0/name" + ] +], "cafe/0/table/0/id": [ - [ - "2_tables", - "B", - 2, - "table/0/id" - ], - [ - "3_dishes", - "B", - 2, - "table/0/id" - ], - [ - "3_dishes", - "B", - 3, - "table/0/id" - ] - ], + [ + "2_tables", + "B", + 2, + "table/0/id" + ], + [ + "3_dishes", + "B", + 2, + "table/0/id" + ], + [ + "3_dishes", + "B", + 3, + "table/0/id" + ] +], "cafe/0/table/0/number": [ - [ - "2_tables", - "C", - 2, - "table/0/number" - ] - ], + [ + "2_tables", + "C", + 2, + "table/0/number" + ] +], "cafe/0/table/1/id": [ - [ - "2_tables", - "B", - 3, - "table/0/id" - ] - ], + [ + "2_tables", + "B", + 3, + "table/0/id" + ] +], "cafe/0/table/1/number": [ - [ - "2_tables", - 
"C", - 3, - "table/0/number" - ] - ], + [ + "2_tables", + "C", + 3, + "table/0/number" + ] +], "cafe/0/table/2/dish/0/cost": [ - [ - "3_dishes", - "D", - 4, - "table/0/dish/0/cost" - ] - ], + [ + "3_dishes", + "D", + 4, + "table/0/dish/0/cost" + ] +], "cafe/0/table/2/dish/0/name": [ - [ - "3_dishes", - "C", - 4, - "table/0/dish/0/name" - ] - ], + [ + "3_dishes", + "C", + 4, + "table/0/dish/0/name" + ] +], "cafe/0/table/2/id": [ - [ - "2_tables", - "B", - 4, - "table/0/id" - ], - [ - "3_dishes", - "B", - 4, - "table/0/id" - ] - ], + [ + "2_tables", + "B", + 4, + "table/0/id" + ], + [ + "3_dishes", + "B", + 4, + "table/0/id" + ] +], "cafe/0/table/2/number": [ - [ - "2_tables", - "C", - 4, - "table/0/number" - ] - ], - "cafe/1/id": [ - [ - "1_cafes", - "A", - 3, - "id" - ], - [ - "2_tables", - "A", - 5, - "id" - ], - [ - "2_tables", - "A", - 6, - "id" - ], - [ - "3_dishes", - "A", - 5, - "id" - ] - ], - "cafe/1/name": [ - [ - "1_cafes", - "B", - 3, - "name" - ] + [ + "2_tables", + "C", + 4, + "table/0/number" + ] +], + "cafe/0": [ + [ + "1_cafes", + 2 ], - "cafe/1/table/0/dish/0/cost": [ - [ - "3_dishes", - "D", - 5, - "table/0/dish/0/cost" - ] + [ + "2_tables", + 2 ], - "cafe/1/table/0/dish/0/name": [ - [ - "3_dishes", - "C", - 5, - "table/0/dish/0/name" - ] + [ + "2_tables", + 3 ], - "cafe/1/table/0/id": [ - [ - "2_tables", - "B", - 5, - "table/0/id" - ], - [ - "3_dishes", - "B", - 5, - "table/0/id" - ] + [ + "2_tables", + 4 ], - "cafe/1/table/0/number": [ - [ - "2_tables", - "C", - 5, - "table/0/number" - ] + [ + "3_dishes", + 2 ], - "cafe/1/table/1/id": [ - [ - "2_tables", - "B", - 6, - "table/0/id" - ] - ], - "cafe/1/table/1/number": [ - [ - "2_tables", - "C", - 6, - "table/0/number" - ] - ], - "cafe/0": [ - [ - "1_cafes", - 2 - ], - [ - "2_tables", - 2 - ], - [ - "2_tables", - 3 - ], - [ - "2_tables", - 4 - ], - [ - "3_dishes", - 2 - ], - [ - "3_dishes", - 3 - ], - [ - "3_dishes", - 4 - ] + [ + "3_dishes", + 3 ], + [ + "3_dishes", + 4 + ] +], 
"cafe/0/table/0/dish/0": [ - [ - "3_dishes", - 2 - ] - ], + [ + "3_dishes", + 2 + ] +], "cafe/0/table/0/dish/1": [ - [ - "3_dishes", - 3 - ] - ], + [ + "3_dishes", + 3 + ] +], "cafe/0/table/0": [ - [ - "2_tables", - 2 - ], - [ - "3_dishes", - 2 - ], - [ - "3_dishes", - 3 - ] + [ + "2_tables", + 2 ], - "cafe/0/table/1": [ - [ - "2_tables", - 3 - ] + [ + "3_dishes", + 2 ], + [ + "3_dishes", + 3 + ] +], + "cafe/0/table/1": [ + [ + "2_tables", + 3 + ] +], "cafe/0/table/2/dish/0": [ - [ - "3_dishes", - 4 - ] - ], + [ + "3_dishes", + 4 + ] +], "cafe/0/table/2": [ - [ - "2_tables", - 4 - ], - [ - "3_dishes", - 4 - ] + [ + "2_tables", + 4 ], + [ + "3_dishes", + 4 + ] +], + "cafe/1/id": [ + [ + "1_cafes", + "A", + 3, + "id" + ], + [ + "2_tables", + "A", + 5, + "id" + ], + [ + "2_tables", + "A", + 6, + "id" + ], + [ + "3_dishes", + "A", + 5, + "id" + ] +], + "cafe/1/name": [ + [ + "1_cafes", + "B", + 3, + "name" + ] +], + "cafe/1/table/0/dish/0/cost": [ + [ + "3_dishes", + "D", + 5, + "table/0/dish/0/cost" + ] +], + "cafe/1/table/0/dish/0/name": [ + [ + "3_dishes", + "C", + 5, + "table/0/dish/0/name" + ] +], + "cafe/1/table/0/id": [ + [ + "2_tables", + "B", + 5, + "table/0/id" + ], + [ + "3_dishes", + "B", + 5, + "table/0/id" + ] +], + "cafe/1/table/0/number": [ + [ + "2_tables", + "C", + 5, + "table/0/number" + ] +], + "cafe/1/table/1/id": [ + [ + "2_tables", + "B", + 6, + "table/0/id" + ] +], + "cafe/1/table/1/number": [ + [ + "2_tables", + "C", + 6, + "table/0/number" + ] +], "cafe/1": [ - [ - "1_cafes", - 3 - ], - [ - "2_tables", - 5 - ], - [ - "2_tables", - 6 - ], - [ - "3_dishes", - 5 - ] + [ + "1_cafes", + 3 ], - "cafe/1/table/0/dish/0": [ - [ - "3_dishes", - 5 - ] + [ + "2_tables", + 5 + ], + [ + "2_tables", + 6 ], + [ + "3_dishes", + 5 + ] +], + "cafe/1/table/0/dish/0": [ + [ + "3_dishes", + 5 + ] +], "cafe/1/table/0": [ - [ - "2_tables", - 5 - ], - [ - "3_dishes", - 5 - ] + [ + "2_tables", + 5 ], + [ + "3_dishes", + 5 + ] +], "cafe/1/table/1": [ - [ - "2_tables", - 
6 - ] + [ + "2_tables", + 6 ] +] } \ No newline at end of file diff --git a/flatten-tool b/flatten-tool index 328a7b7d..df6fda33 100755 --- a/flatten-tool +++ b/flatten-tool @@ -1,3 +1,4 @@ #!/usr/bin/env python import flattentool.cli + flattentool.cli.main() diff --git a/flattentool/__init__.py b/flattentool/__init__.py index b700353a..4486fedb 100644 --- a/flattentool/__init__.py +++ b/flattentool/__init__.py @@ -1,16 +1,24 @@ import codecs import json +import os import sys +import tempfile +import uuid from collections import OrderedDict from decimal import Decimal +import jsonstreams +import lxml.etree +import zc.zlibstorage +import ZODB.FileStorage + from flattentool.input import FORMATS as INPUT_FORMATS from flattentool.json_input import JSONParser from flattentool.lib import parse_sheet_configuration from flattentool.output import FORMATS as OUTPUT_FORMATS from flattentool.output import FORMATS_SUFFIX from flattentool.schema import SchemaParser -from flattentool.xml_output import toxml +from flattentool.xml_output import generate_schema_dict, write_comment, xml_item def create_template( @@ -179,7 +187,103 @@ def decimal_default(o): raise TypeError(repr(o) + " is not JSON serializable") +# This is to just to make ensure_ascii and default are correct for streaming library +class CustomJSONEncoder(json.JSONEncoder): + def __init__(self, **kw): + super().__init__(**kw) + # overwrie these no matter the input to __init__ + self.ensure_ascii = False + self.default = decimal_default + + +def get_output(output_name, xml=False): + if not output_name: + if xml: + return sys.stdout.buffer + else: + return sys.stdout + if xml: + return codecs.open(output_name, "wb") + return codecs.open(output_name, "w", encoding="utf-8") + + def unflatten( + input_name, + output_name=None, + cell_source_map=None, + root_is_list=False, + xml=False, + **kw +): + unflatten_kw = { + "output_name": output_name, + "cell_source_map": cell_source_map, + "root_is_list": root_is_list, + "xml": 
xml, + } + unflatten_kw.update(kw) + + zodb_db_location = tempfile.gettempdir() + "/flattentool-" + str(uuid.uuid4()) + zodb_storage = zc.zlibstorage.ZlibStorage( + ZODB.FileStorage.FileStorage(zodb_db_location) + ) + db = ZODB.DB(zodb_storage) + unflatten_kw["db"] = db + + try: + if xml: + with get_output(output_name, xml=True) as xml_file, lxml.etree.xmlfile( + xml_file, encoding="utf-8" + ) as xml_stream: + unflatten_kw["xml_stream"] = xml_stream + if cell_source_map: + with get_output( + cell_source_map + ) as cell_source_map_file, jsonstreams.Stream( + jsonstreams.Type.object, + fd=cell_source_map_file, + indent=4, + encoder=CustomJSONEncoder, + ) as cell_source_map_stream: + unflatten_kw["cell_source_map_stream"] = cell_source_map_stream + _unflatten(input_name, **unflatten_kw) + else: + _unflatten(input_name, **unflatten_kw) + + else: + json_stream_args = {"indent": 4, "encoder": CustomJSONEncoder} + if root_is_list: + json_stream_args["jtype"] = jsonstreams.Type.array + else: + json_stream_args["jtype"] = jsonstreams.Type.object + + with get_output(output_name) as json_file, jsonstreams.Stream( + fd=json_file, **json_stream_args + ) as json_stream: + unflatten_kw["json_stream"] = json_stream + if cell_source_map: + with get_output( + cell_source_map + ) as cell_source_map_file, jsonstreams.Stream( + jsonstreams.Type.object, + fd=cell_source_map_file, + indent=4, + encoder=CustomJSONEncoder, + ) as cell_source_map_stream: + unflatten_kw["cell_source_map_stream"] = cell_source_map_stream + _unflatten(input_name, **unflatten_kw) + else: + _unflatten(input_name, **unflatten_kw) + + finally: + db.close() + os.remove(zodb_db_location) + os.remove(zodb_db_location + ".lock") + os.remove(zodb_db_location + ".index") + os.remove(zodb_db_location + ".tmp") + + +def _unflatten( input_name, base_json=None, input_format=None, @@ -205,6 +309,10 @@ def unflatten( disable_local_refs=False, xml_comment=None, truncation_length=3, + json_stream=None, + 
cell_source_map_stream=None, + xml_stream=None, + db=None, **_ ): """ @@ -218,20 +326,18 @@ def unflatten( if metatab_name and base_json: raise Exception("Not allowed to use base_json with metatab") - if root_is_list: - base = None - elif base_json: + if not root_is_list and base_json: with open(base_json) as fp: base = json.load(fp, object_pairs_hook=OrderedDict) - else: - base = OrderedDict() + for key, value in base.items(): + json_stream.write(key, value) base_configuration = parse_sheet_configuration( [item.strip() for item in default_configuration.split(",")] ) - cell_source_map_data = OrderedDict() heading_source_map_data = OrderedDict() + meta_result = None if metatab_name and not root_is_list: spreadsheet_input_class = INPUT_FORMATS[input_format] @@ -255,7 +361,7 @@ def unflatten( spreadsheet_input.encoding = encoding spreadsheet_input.read_sheets() ( - result, + meta_result, cell_source_map_data_meta, heading_source_map_data_meta, ) = spreadsheet_input.fancy_unflatten( @@ -264,7 +370,9 @@ def unflatten( ) for key, value in (cell_source_map_data_meta or {}).items(): ## strip off meta/0/ from start of source map as actually data is at top level - cell_source_map_data[key[7:]] = value + if cell_source_map_stream: + cell_source_map_stream.write(key[7:], value) + for key, value in (heading_source_map_data_meta or {}).items(): ## strip off meta/ from start of source map as actually data is at top level heading_source_map_data[key[5:]] = value @@ -274,9 +382,6 @@ def unflatten( spreadsheet_input.sheet_configuration.get(metatab_name, {}) ) - if result: - base.update(result[0]) - if root_list_path is None: root_list_path = base_configuration.get("RootListPath", "main") if id_name is None: @@ -309,54 +414,85 @@ def unflatten( spreadsheet_input.parser = parser spreadsheet_input.encoding = encoding spreadsheet_input.read_sheets() - ( - result, - cell_source_map_data_main, - heading_source_map_data_main, - ) = spreadsheet_input.fancy_unflatten( - 
with_cell_source_map=cell_source_map, - with_heading_source_map=heading_source_map, - ) - cell_source_map_data.update(cell_source_map_data_main or {}) - heading_source_map_data.update(heading_source_map_data_main or {}) - if root_is_list: - base = list(result) - else: - base[root_list_path] = list(result) if xml: xml_root_tag = base_configuration.get("XMLRootTag", "iati-activities") - xml_output = toxml( - base, - xml_root_tag, - xml_schemas=xml_schemas, - root_list_path=root_list_path, - xml_comment=xml_comment, - ) - if output_name is None: - sys.stdout.buffer.write(xml_output) - else: - with codecs.open(output_name, "wb") as fp: - fp.write(xml_output) - else: - if output_name is None: - print( - json.dumps(base, indent=4, default=decimal_default, ensure_ascii=False) - ) - else: - with codecs.open(output_name, "w", encoding="utf-8") as fp: - json.dump( - base, fp, indent=4, default=decimal_default, ensure_ascii=False - ) - if cell_source_map: - with codecs.open(cell_source_map, "w", encoding="utf-8") as fp: - json.dump( - cell_source_map_data, - fp, - indent=4, - default=decimal_default, - ensure_ascii=False, - ) + + if not metatab_only: + xml_stream.write_declaration() + with xml_stream.element(xml_root_tag): + write_comment(xml_stream, xml_comment) + + for ( + single_result, + cell_source_map_data_main, + heading_source_map_data_main, + ) in spreadsheet_input.unflatten_with_storage( + with_cell_source_map=cell_source_map, + with_heading_source_map=heading_source_map, + db=db, + ): + + schema_dict = None + if xml_schemas: + schema_dict = generate_schema_dict(xml_schemas, root_list_path) + + for item in single_result: + xml_item(xml_stream, item, root_list_path, schema_dict) + + if cell_source_map_stream and cell_source_map_data_main: + for key, value in cell_source_map_data_main.items(): + cell_source_map_stream.write(key, value) + + for key, value in (heading_source_map_data_main or {}).items(): + if key in heading_source_map_data: + for item in 
heading_source_map_data_main[key]: + if item not in heading_source_map_data[key]: + heading_source_map_data[key].append(item) + else: + heading_source_map_data[key] = heading_source_map_data_main[ + key + ] + + if not xml: + if meta_result: + for key, value in meta_result[0].items(): + json_stream.write(key, value) + + if not metatab_only: + if not root_is_list: + list_stream = json_stream.subarray(root_list_path) + else: + list_stream = json_stream + + for ( + single_result, + cell_source_map_data_main, + heading_source_map_data_main, + ) in spreadsheet_input.unflatten_with_storage( + with_cell_source_map=cell_source_map, + with_heading_source_map=heading_source_map, + db=db, + ): + + if cell_source_map_stream and cell_source_map_data_main: + for key, value in cell_source_map_data_main.items(): + cell_source_map_stream.write(key, value) + + for item in single_result: + list_stream.write(item) + + for key, value in (heading_source_map_data_main or {}).items(): + if key in heading_source_map_data: + for item in heading_source_map_data_main[key]: + if item not in heading_source_map_data[key]: + heading_source_map_data[key].append(item) + else: + heading_source_map_data[key] = heading_source_map_data_main[key] + + if not root_is_list: + list_stream.close() + if heading_source_map: with codecs.open(heading_source_map, "w", encoding="utf-8") as fp: json.dump( diff --git a/flattentool/input.py b/flattentool/input.py index 62d59451..d09d7948 100644 --- a/flattentool/input.py +++ b/flattentool/input.py @@ -7,14 +7,19 @@ import datetime import os +import uuid from collections import OrderedDict, UserDict from csv import DictReader from csv import reader as csvreader from decimal import Decimal, InvalidOperation from warnings import warn +import BTrees import openpyxl +import persistent.list import pytz +import transaction +import ZODB from openpyxl.utils.cell import _get_column_letter from flattentool.exceptions import DataErrorWarning @@ -258,6 +263,7 @@ def __init__( 
xml=False, base_configuration={}, use_configuration=True, + persist=False, ): self.input_name = input_name self.root_list_path = root_list_path @@ -275,6 +281,8 @@ def __init__( self.base_configuration = base_configuration or {} self.sheet_configuration = {} self.use_configuration = use_configuration + self.persist = persist + self.actual_headings = {} def get_sub_sheets_lines(self): for sub_sheet_name in self.sub_sheet_names: @@ -306,13 +314,20 @@ def get_sheet_headings(self, sheet_name): def read_sheets(self): raise NotImplementedError - def do_unflatten(self): + def do_unflatten(self, sheet_lines=None): main_sheet_by_ocid = OrderedDict() - sheets = list(self.get_sub_sheets_lines()) + if sheet_lines: + sheets = sheet_lines.items() + else: + sheets = list(self.get_sub_sheets_lines()) for i, sheet in enumerate(sheets): sheet_name, lines = sheet try: - actual_headings = self.get_sheet_headings(sheet_name) + # cache headings + actual_headings = self.actual_headings.get(sheet_name) + if not actual_headings: + actual_headings = self.get_sheet_headings(sheet_name) + self.actual_headings[sheet_name] = actual_headings # If sheet is empty or too many lines have been skipped if not actual_headings: continue @@ -384,7 +399,14 @@ def do_unflatten(self): except NotImplementedError: # The ListInput type used in the tests doesn't support getting headings. 
actual_headings = None - for j, line in enumerate(lines): + + if not sheet_lines: + lines_generator = enumerate(lines) + else: + # when sheet lines are supplied then get sheet row numbers out of dictionary + lines_generator = sorted(list(lines.items())) + + for j, line in lines_generator: if all(x is None or x == "" for x in line.values()): # if all(x == '' for x in line.values()): continue @@ -452,14 +474,77 @@ def unflatten(self): result = extract_list_to_value(result) return result - def fancy_unflatten(self, with_cell_source_map, with_heading_source_map): - cell_tree = self.do_unflatten() + def unflatten_with_storage( + self, with_cell_source_map, with_heading_source_map, db=None + ): + + if not db: + # If None, in memory storage is used. + db = ZODB.DB(None) + + self.connection = db.open() + root = self.connection.root + + # Each top level object is assigned an integer. This way we preseve ordering as much as possible + root.object_store = BTrees.IOBTree.BTree() + + # this matches the top-level id field to its index value. 
+ root.object_index = BTrees.OIBTree.BTree() + + index = 0 + + for sheet, rows in self.get_sub_sheets_lines(): + for row_numbar, row in enumerate(rows): + + ##uuid to stop clash with any key for objects with no id + top_level_id = row.get(self.id_name) or str(uuid.uuid4()) + + current_index = root.object_index.get(top_level_id) + + if current_index is None: + current_index = index + root.object_index[top_level_id] = current_index + root.object_store[current_index] = persistent.list.PersistentList() + index += 1 + + root.object_store[current_index].append((sheet, row_numbar, row)) + + if row_numbar != 0 and row_numbar % 1000 == 0: + transaction.commit() + self.connection.cacheMinimize() + + transaction.commit() + + self.connection.cacheMinimize() + + for current_index, row_list in root.object_store.items(): + sheet_lines = OrderedDict() + for sheet, row_numbar, row in row_list: + if sheet not in sheet_lines: + sheet_lines[sheet] = OrderedDict() + sheet_lines[sheet][row_numbar] = row + yield self.fancy_unflatten( + with_cell_source_map, with_cell_source_map, sheet_lines, current_index + ) + + self.connection.cacheMinimize() + + def fancy_unflatten( + self, + with_cell_source_map, + with_heading_source_map, + sheet_lines=None, + index=None, + ): + cell_tree = self.do_unflatten(sheet_lines=sheet_lines) result = extract_list_to_value(cell_tree) ordered_cell_source_map = None heading_source_map = None if with_cell_source_map or with_heading_source_map: cell_source_map = extract_list_to_error_path( - [] if self.root_is_list else [self.root_list_path], cell_tree + [] if self.root_is_list else [self.root_list_path], + cell_tree, + index=index, ) ordered_items = sorted(cell_source_map.items()) row_source_map = OrderedDict() @@ -502,10 +587,12 @@ def fancy_unflatten(self, with_cell_source_map, with_heading_source_map): return result, ordered_cell_source_map, heading_source_map -def extract_list_to_error_path(path, input): +def extract_list_to_error_path(path, input, 
index=None): output = {} + if index: + assert len(input) <= 1 for i, item in enumerate(input): - res = extract_dict_to_error_path(path + [i], item) + res = extract_dict_to_error_path(path + [index or i], item) for p in res: assert p not in output, _("Already have key {}").format(p) output[p] = res[p] @@ -652,7 +739,14 @@ class BadXLSXZipFile(BadZipFile): class XLSXInput(SpreadsheetInput): def read_sheets(self): try: - self.workbook = openpyxl.load_workbook(self.input_name, data_only=True) + if self.vertical_orientation: + # read_only mode only works when reading rows not columns + self.workbook = openpyxl.load_workbook(self.input_name, data_only=True) + else: + self.workbook = openpyxl.load_workbook( + self.input_name, data_only=True, read_only=True + ) + except BadZipFile as e: # noqa # TODO when we have python3 only add 'from e' to show exception chain raise BadXLSXZipFile( diff --git a/flattentool/tests/fixtures/iati-org.xml b/flattentool/tests/fixtures/iati-org.xml index 5b181d0e..2ff00249 100644 --- a/flattentool/tests/fixtures/iati-org.xml +++ b/flattentool/tests/fixtures/iati-org.xml @@ -1,8 +1,7 @@ - - - - AA-AAA-123456789 - - - + + + AA-AAA-123456789 + + + \ No newline at end of file diff --git a/flattentool/tests/test_docs.py b/flattentool/tests/test_docs.py index 86d156a7..a98c4191 100644 --- a/flattentool/tests/test_docs.py +++ b/flattentool/tests/test_docs.py @@ -151,7 +151,7 @@ def _simplify_line(line): def _strip(output): # Don't worry about any extra blank lines at the end either outstr = str(output, "utf8").rstrip("\n") - return "\n".join(line.rstrip(" ") for line in outstr.split("\n")) + return "\n".join(line.strip() for line in outstr.split("\n")) # Useful for a coverage check - see developer docs for how to run the check diff --git a/flattentool/tests/test_init.py b/flattentool/tests/test_init.py index 511404db..124f12eb 100644 --- a/flattentool/tests/test_init.py +++ b/flattentool/tests/test_init.py @@ -660,10 +660,8 @@ def 
test_unflatten(tmpdir): ] ] }""" - assert lines_strip_whitespace( - tmpdir.join("cell_source_map.json").read() - ) == lines_strip_whitespace(expected) data = json.loads(expected) + assert json.loads(tmpdir.join("cell_source_map.json").read()) == data cells, rows = original_cell_and_row_locations(data) # Make sure every cell in the original appeared in the cell source map exactly once assert cells == [ @@ -855,10 +853,8 @@ def test_unflatten(tmpdir): ] ] }""" - assert lines_strip_whitespace( - tmpdir.join("heading_source_map.json").read() - ) == lines_strip_whitespace(expected_headings) heading_data = json.loads(expected_headings) + assert json.loads(tmpdir.join("heading_source_map.json").read()) == heading_data headings = original_headings(heading_data) # Make sure every heading in the original appeared in the heading source map exactly once assert headings == [ @@ -997,7 +993,9 @@ def test_unflatten_empty(tmpdir): tmpdir.join("release.json").read() ) == lines_strip_whitespace( """{ - "main": [] + "main": [ + + ] }""" ) diff --git a/flattentool/xml_output.py b/flattentool/xml_output.py index a689d14c..5ed788ca 100644 --- a/flattentool/xml_output.py +++ b/flattentool/xml_output.py @@ -111,3 +111,30 @@ def toxml( ) else: return ET.tostring(root) + + +def write_comment(xml_stream, xml_comment): + if xml_comment is None: + xml_comment = "XML generated by flatten-tool" + if xml_comment: + xml_stream.write(ET.Comment(xml_comment), pretty_print=True) + + +def generate_schema_dict(xml_schemas, root_list_path): + return XMLSchemaWalker(xml_schemas).create_schema_dict(root_list_path) + + +def xml_item(xml_stream, data, root_list_path="iati-activity", schema_dict=None): + nsmap = { + # This is "bound by definition" - see https://www.w3.org/XML/1998/namespace + "xml": "http://www.w3.org/XML/1998/namespace" + } + + root = dict_to_xml(data, root_list_path, nsmap=nsmap) + if schema_dict: + sort_element(root, schema_dict) + + xml_stream.write(root, pretty_print=True) + root = None 
+ + return schema_dict diff --git a/setup.py b/setup.py index 1202823b..24c37097 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,7 @@ def run(self): "zodb", "zc.zlibstorage", "ijson", + "jsonstreams", ] setup(