diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 359a24eb..29fa04b0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -24,4 +24,4 @@ jobs:
- run: py.test --cov .
- env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: coveralls
+ run: coveralls --service=github
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ce5eb884..b916ee1d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### Fixed
+- flattening: Uses much less memory by storing data in an embedded ZODB database, streaming input with ijson, and using write-only mode in openpyxl.
- use-titles: Use $ref'erring title if available https://github.com/OpenDataServices/flatten-tool/pull/368
- create-template --no-deprecated-fields: Did not work if deprecated element at same level as a $ref https://github.com/OpenDataServices/flatten-tool/issues/185#issuecomment-719587348
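
The flattening entry above relies on streaming the input rather than loading it whole. A minimal sketch of the ijson half of that approach, assuming a root list under the default `main` key (the function name is illustrative):

```python
# Illustrative sketch: stream top-level items with ijson instead of
# json.load, so the whole input document never sits in memory at once.
import ijson


def iter_items(json_path, root_list_path="main"):
    # "<root_list_path>.item" yields each element of the root list in turn
    with open(json_path, "rb") as f:
        for item in ijson.items(f, root_list_path + ".item"):
            yield item
```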
diff --git a/examples/bods/unflatten/expected/out.json b/examples/bods/unflatten/expected/out.json
index ed3c55d0..a51b1103 100644
--- a/examples/bods/unflatten/expected/out.json
+++ b/examples/bods/unflatten/expected/out.json
@@ -1,24 +1,24 @@
[
{
- "statementID": "fbfd0547-d0c6-4a00-b559-5c5e91c34f5c",
- "interests": [
- {
- "type": "shareholding",
- "interestLevel": "direct",
- "beneficialOwnershipOrControl": true,
- "startDate": "2016-04-06",
- "share": {
- "exact": 100
- }
+ "statementID": "fbfd0547-d0c6-4a00-b559-5c5e91c34f5c",
+ "interests": [
+ {
+ "type": "shareholding",
+ "interestLevel": "direct",
+ "beneficialOwnershipOrControl": true,
+ "startDate": "2016-04-06",
+ "share": {
+ "exact": 100
}
- ],
- "statementType": "ownershipOrControlStatement",
- "statementDate": "2017-11-18",
- "subject": {
- "describedByEntityStatement": "1dc0e987-5c57-4a1c-b3ad-61353b66a9b7"
- },
- "interestedParty": {
- "describedByPersonStatement": "019a93f1-e470-42e9-957b-03559861b2e2"
}
+ ],
+ "statementType": "ownershipOrControlStatement",
+ "statementDate": "2017-11-18",
+ "subject": {
+ "describedByEntityStatement": "1dc0e987-5c57-4a1c-b3ad-61353b66a9b7"
+ },
+ "interestedParty": {
+ "describedByPersonStatement": "019a93f1-e470-42e9-957b-03559861b2e2"
}
-]
\ No newline at end of file
+}
+]
diff --git a/examples/cafe/relationship-missing-ids/expected.json b/examples/cafe/relationship-missing-ids/expected.json
index 534569ed..c7beaa11 100644
--- a/examples/cafe/relationship-missing-ids/expected.json
+++ b/examples/cafe/relationship-missing-ids/expected.json
@@ -16,6 +16,10 @@
}
]
},
+ {
+ "name": "Vegetarian Cafe",
+ "address": "42 Town Road, Bristol"
+ },
{
"id": "CAFE-VEG",
"table": [
@@ -24,10 +28,6 @@
}
]
},
- {
- "name": "Vegetarian Cafe",
- "address": "42 Town Road, Bristol"
- },
{
"table": [
{
diff --git a/examples/iati/expected.xml b/examples/iati/expected.xml
index 3677d49a..51cd0495 100644
--- a/examples/iati/expected.xml
+++ b/examples/iati/expected.xml
@@ -1,58 +1,57 @@
-
-
-
- AA-AAA-123456789-ABC123
-
- Organisation name
-
-
- A title
-
-
- A description
-
-
-
-
-
-
-
-
-
- 10
-
-
-
-
- 20
-
-
-
- AA-AAA-123456789-ABC124
-
- Organisation name
-
-
- Another title
-
-
- Another description
-
-
-
-
-
-
-
-
-
- 30
-
-
-
-
- 40
-
-
-
+
+
+ AA-AAA-123456789-ABC123
+
+ Organisation name
+
+
+ A title
+
+
+ A description
+
+
+
+
+
+
+
+
+
+ 10
+
+
+
+
+ 20
+
+
+
+ AA-AAA-123456789-ABC124
+
+ Organisation name
+
+
+ Another title
+
+
+ Another description
+
+
+
+
+
+
+
+
+
+ 30
+
+
+
+
+ 40
+
+
+
\ No newline at end of file
diff --git a/examples/iati_multilang/expected.xml b/examples/iati_multilang/expected.xml
index e0c34da0..4949f699 100644
--- a/examples/iati_multilang/expected.xml
+++ b/examples/iati_multilang/expected.xml
@@ -1,60 +1,59 @@
-
-
-
- AA-AAA-123456789-ABC123
-
- Organisation name
-
-
- A title, with comma
- Un titre
-
-
- A description
-
-
-
-
-
-
-
-
-
- 10
-
-
-
-
- 20
-
-
-
- AA-AAA-123456789-ABC124
-
- Organisation name
-
-
- Another title; with semicolon
- Un autre titre
-
-
- Another description
-
-
-
-
-
-
-
-
-
- 30
-
-
-
-
- 40
-
-
-
+
+
+ AA-AAA-123456789-ABC123
+
+ Organisation name
+
+
+ A title, with comma
+ Un titre
+
+
+ A description
+
+
+
+
+
+
+
+
+
+ 10
+
+
+
+
+ 20
+
+
+
+ AA-AAA-123456789-ABC124
+
+ Organisation name
+
+
+ Another title; with semicolon
+ Un autre titre
+
+
+ Another description
+
+
+
+
+
+
+
+
+
+ 30
+
+
+
+
+ 40
+
+
+
\ No newline at end of file
diff --git a/examples/iati_xml_comment/expected.xml b/examples/iati_xml_comment/expected.xml
index 8d131cd9..bc8305eb 100644
--- a/examples/iati_xml_comment/expected.xml
+++ b/examples/iati_xml_comment/expected.xml
@@ -1,60 +1,59 @@
-
-
-
- AA-AAA-123456789-ABC123
-
- Organisation name
-
-
- A title, with comma
- Un titre
-
-
- A description
-
-
-
-
-
-
-
-
-
- 10
-
-
-
-
- 20
-
-
-
- AA-AAA-123456789-ABC124
-
- Organisation name
-
-
- Another title; with semicolon
- Un autre titre
-
-
- Another description
-
-
-
-
-
-
-
-
-
- 30
-
-
-
-
- 40
-
-
-
+
+
+ AA-AAA-123456789-ABC123
+
+ Organisation name
+
+
+ A title, with comma
+ Un titre
+
+
+ A description
+
+
+
+
+
+
+
+
+
+ 10
+
+
+
+
+ 20
+
+
+
+ AA-AAA-123456789-ABC124
+
+ Organisation name
+
+
+ Another title; with semicolon
+ Un autre titre
+
+
+ Another description
+
+
+
+
+
+
+
+
+
+ 30
+
+
+
+
+ 40
+
+
+
\ No newline at end of file
diff --git a/examples/receipt/source-map/expected/cell_source_map.json b/examples/receipt/source-map/expected/cell_source_map.json
index e4ac98cc..fa16d4ff 100644
--- a/examples/receipt/source-map/expected/cell_source_map.json
+++ b/examples/receipt/source-map/expected/cell_source_map.json
@@ -1,374 +1,374 @@
{
"cafe/0/id": [
- [
- "1_cafes",
- "A",
- 2,
- "id"
- ],
- [
- "2_tables",
- "A",
- 2,
- "id"
- ],
- [
- "2_tables",
- "A",
- 3,
- "id"
- ],
- [
- "2_tables",
- "A",
- 4,
- "id"
- ],
- [
- "3_dishes",
- "A",
- 2,
- "id"
- ],
- [
- "3_dishes",
- "A",
- 3,
- "id"
- ],
- [
- "3_dishes",
- "A",
- 4,
- "id"
- ]
- ],
+ [
+ "1_cafes",
+ "A",
+ 2,
+ "id"
+ ],
+ [
+ "2_tables",
+ "A",
+ 2,
+ "id"
+ ],
+ [
+ "2_tables",
+ "A",
+ 3,
+ "id"
+ ],
+ [
+ "2_tables",
+ "A",
+ 4,
+ "id"
+ ],
+ [
+ "3_dishes",
+ "A",
+ 2,
+ "id"
+ ],
+ [
+ "3_dishes",
+ "A",
+ 3,
+ "id"
+ ],
+ [
+ "3_dishes",
+ "A",
+ 4,
+ "id"
+ ]
+],
"cafe/0/name": [
- [
- "1_cafes",
- "B",
- 2,
- "name"
- ]
- ],
+ [
+ "1_cafes",
+ "B",
+ 2,
+ "name"
+ ]
+],
"cafe/0/table/0/dish/0/cost": [
- [
- "3_dishes",
- "D",
- 2,
- "table/0/dish/0/cost"
- ]
- ],
+ [
+ "3_dishes",
+ "D",
+ 2,
+ "table/0/dish/0/cost"
+ ]
+],
"cafe/0/table/0/dish/0/name": [
- [
- "3_dishes",
- "C",
- 2,
- "table/0/dish/0/name"
- ]
- ],
+ [
+ "3_dishes",
+ "C",
+ 2,
+ "table/0/dish/0/name"
+ ]
+],
"cafe/0/table/0/dish/1/cost": [
- [
- "3_dishes",
- "D",
- 3,
- "table/0/dish/0/cost"
- ]
- ],
+ [
+ "3_dishes",
+ "D",
+ 3,
+ "table/0/dish/0/cost"
+ ]
+],
"cafe/0/table/0/dish/1/name": [
- [
- "3_dishes",
- "C",
- 3,
- "table/0/dish/0/name"
- ]
- ],
+ [
+ "3_dishes",
+ "C",
+ 3,
+ "table/0/dish/0/name"
+ ]
+],
"cafe/0/table/0/id": [
- [
- "2_tables",
- "B",
- 2,
- "table/0/id"
- ],
- [
- "3_dishes",
- "B",
- 2,
- "table/0/id"
- ],
- [
- "3_dishes",
- "B",
- 3,
- "table/0/id"
- ]
- ],
+ [
+ "2_tables",
+ "B",
+ 2,
+ "table/0/id"
+ ],
+ [
+ "3_dishes",
+ "B",
+ 2,
+ "table/0/id"
+ ],
+ [
+ "3_dishes",
+ "B",
+ 3,
+ "table/0/id"
+ ]
+],
"cafe/0/table/0/number": [
- [
- "2_tables",
- "C",
- 2,
- "table/0/number"
- ]
- ],
+ [
+ "2_tables",
+ "C",
+ 2,
+ "table/0/number"
+ ]
+],
"cafe/0/table/1/id": [
- [
- "2_tables",
- "B",
- 3,
- "table/0/id"
- ]
- ],
+ [
+ "2_tables",
+ "B",
+ 3,
+ "table/0/id"
+ ]
+],
"cafe/0/table/1/number": [
- [
- "2_tables",
- "C",
- 3,
- "table/0/number"
- ]
- ],
+ [
+ "2_tables",
+ "C",
+ 3,
+ "table/0/number"
+ ]
+],
"cafe/0/table/2/dish/0/cost": [
- [
- "3_dishes",
- "D",
- 4,
- "table/0/dish/0/cost"
- ]
- ],
+ [
+ "3_dishes",
+ "D",
+ 4,
+ "table/0/dish/0/cost"
+ ]
+],
"cafe/0/table/2/dish/0/name": [
- [
- "3_dishes",
- "C",
- 4,
- "table/0/dish/0/name"
- ]
- ],
+ [
+ "3_dishes",
+ "C",
+ 4,
+ "table/0/dish/0/name"
+ ]
+],
"cafe/0/table/2/id": [
- [
- "2_tables",
- "B",
- 4,
- "table/0/id"
- ],
- [
- "3_dishes",
- "B",
- 4,
- "table/0/id"
- ]
- ],
+ [
+ "2_tables",
+ "B",
+ 4,
+ "table/0/id"
+ ],
+ [
+ "3_dishes",
+ "B",
+ 4,
+ "table/0/id"
+ ]
+],
"cafe/0/table/2/number": [
- [
- "2_tables",
- "C",
- 4,
- "table/0/number"
- ]
- ],
- "cafe/1/id": [
- [
- "1_cafes",
- "A",
- 3,
- "id"
- ],
- [
- "2_tables",
- "A",
- 5,
- "id"
- ],
- [
- "2_tables",
- "A",
- 6,
- "id"
- ],
- [
- "3_dishes",
- "A",
- 5,
- "id"
- ]
- ],
- "cafe/1/name": [
- [
- "1_cafes",
- "B",
- 3,
- "name"
- ]
+ [
+ "2_tables",
+ "C",
+ 4,
+ "table/0/number"
+ ]
+],
+ "cafe/0": [
+ [
+ "1_cafes",
+ 2
],
- "cafe/1/table/0/dish/0/cost": [
- [
- "3_dishes",
- "D",
- 5,
- "table/0/dish/0/cost"
- ]
+ [
+ "2_tables",
+ 2
],
- "cafe/1/table/0/dish/0/name": [
- [
- "3_dishes",
- "C",
- 5,
- "table/0/dish/0/name"
- ]
+ [
+ "2_tables",
+ 3
],
- "cafe/1/table/0/id": [
- [
- "2_tables",
- "B",
- 5,
- "table/0/id"
- ],
- [
- "3_dishes",
- "B",
- 5,
- "table/0/id"
- ]
+ [
+ "2_tables",
+ 4
],
- "cafe/1/table/0/number": [
- [
- "2_tables",
- "C",
- 5,
- "table/0/number"
- ]
+ [
+ "3_dishes",
+ 2
],
- "cafe/1/table/1/id": [
- [
- "2_tables",
- "B",
- 6,
- "table/0/id"
- ]
- ],
- "cafe/1/table/1/number": [
- [
- "2_tables",
- "C",
- 6,
- "table/0/number"
- ]
- ],
- "cafe/0": [
- [
- "1_cafes",
- 2
- ],
- [
- "2_tables",
- 2
- ],
- [
- "2_tables",
- 3
- ],
- [
- "2_tables",
- 4
- ],
- [
- "3_dishes",
- 2
- ],
- [
- "3_dishes",
- 3
- ],
- [
- "3_dishes",
- 4
- ]
+ [
+ "3_dishes",
+ 3
],
+ [
+ "3_dishes",
+ 4
+ ]
+],
"cafe/0/table/0/dish/0": [
- [
- "3_dishes",
- 2
- ]
- ],
+ [
+ "3_dishes",
+ 2
+ ]
+],
"cafe/0/table/0/dish/1": [
- [
- "3_dishes",
- 3
- ]
- ],
+ [
+ "3_dishes",
+ 3
+ ]
+],
"cafe/0/table/0": [
- [
- "2_tables",
- 2
- ],
- [
- "3_dishes",
- 2
- ],
- [
- "3_dishes",
- 3
- ]
+ [
+ "2_tables",
+ 2
],
- "cafe/0/table/1": [
- [
- "2_tables",
- 3
- ]
+ [
+ "3_dishes",
+ 2
],
+ [
+ "3_dishes",
+ 3
+ ]
+],
+ "cafe/0/table/1": [
+ [
+ "2_tables",
+ 3
+ ]
+],
"cafe/0/table/2/dish/0": [
- [
- "3_dishes",
- 4
- ]
- ],
+ [
+ "3_dishes",
+ 4
+ ]
+],
"cafe/0/table/2": [
- [
- "2_tables",
- 4
- ],
- [
- "3_dishes",
- 4
- ]
+ [
+ "2_tables",
+ 4
],
+ [
+ "3_dishes",
+ 4
+ ]
+],
+ "cafe/1/id": [
+ [
+ "1_cafes",
+ "A",
+ 3,
+ "id"
+ ],
+ [
+ "2_tables",
+ "A",
+ 5,
+ "id"
+ ],
+ [
+ "2_tables",
+ "A",
+ 6,
+ "id"
+ ],
+ [
+ "3_dishes",
+ "A",
+ 5,
+ "id"
+ ]
+],
+ "cafe/1/name": [
+ [
+ "1_cafes",
+ "B",
+ 3,
+ "name"
+ ]
+],
+ "cafe/1/table/0/dish/0/cost": [
+ [
+ "3_dishes",
+ "D",
+ 5,
+ "table/0/dish/0/cost"
+ ]
+],
+ "cafe/1/table/0/dish/0/name": [
+ [
+ "3_dishes",
+ "C",
+ 5,
+ "table/0/dish/0/name"
+ ]
+],
+ "cafe/1/table/0/id": [
+ [
+ "2_tables",
+ "B",
+ 5,
+ "table/0/id"
+ ],
+ [
+ "3_dishes",
+ "B",
+ 5,
+ "table/0/id"
+ ]
+],
+ "cafe/1/table/0/number": [
+ [
+ "2_tables",
+ "C",
+ 5,
+ "table/0/number"
+ ]
+],
+ "cafe/1/table/1/id": [
+ [
+ "2_tables",
+ "B",
+ 6,
+ "table/0/id"
+ ]
+],
+ "cafe/1/table/1/number": [
+ [
+ "2_tables",
+ "C",
+ 6,
+ "table/0/number"
+ ]
+],
"cafe/1": [
- [
- "1_cafes",
- 3
- ],
- [
- "2_tables",
- 5
- ],
- [
- "2_tables",
- 6
- ],
- [
- "3_dishes",
- 5
- ]
+ [
+ "1_cafes",
+ 3
],
- "cafe/1/table/0/dish/0": [
- [
- "3_dishes",
- 5
- ]
+ [
+ "2_tables",
+ 5
+ ],
+ [
+ "2_tables",
+ 6
],
+ [
+ "3_dishes",
+ 5
+ ]
+],
+ "cafe/1/table/0/dish/0": [
+ [
+ "3_dishes",
+ 5
+ ]
+],
"cafe/1/table/0": [
- [
- "2_tables",
- 5
- ],
- [
- "3_dishes",
- 5
- ]
+ [
+ "2_tables",
+ 5
],
+ [
+ "3_dishes",
+ 5
+ ]
+],
"cafe/1/table/1": [
- [
- "2_tables",
- 6
- ]
+ [
+ "2_tables",
+ 6
]
+]
}
\ No newline at end of file
diff --git a/flatten-tool b/flatten-tool
index 328a7b7d..df6fda33 100755
--- a/flatten-tool
+++ b/flatten-tool
@@ -1,3 +1,4 @@
#!/usr/bin/env python
import flattentool.cli
+
flattentool.cli.main()
diff --git a/flattentool/__init__.py b/flattentool/__init__.py
index 5c4f4bbf..4486fedb 100644
--- a/flattentool/__init__.py
+++ b/flattentool/__init__.py
@@ -1,16 +1,24 @@
import codecs
import json
+import os
import sys
+import tempfile
+import uuid
from collections import OrderedDict
from decimal import Decimal
+import jsonstreams
+import lxml.etree
+import zc.zlibstorage
+import ZODB.FileStorage
+
from flattentool.input import FORMATS as INPUT_FORMATS
from flattentool.json_input import JSONParser
from flattentool.lib import parse_sheet_configuration
from flattentool.output import FORMATS as OUTPUT_FORMATS
from flattentool.output import FORMATS_SUFFIX
from flattentool.schema import SchemaParser
-from flattentool.xml_output import toxml
+from flattentool.xml_output import generate_schema_dict, write_comment, xml_item
def create_template(
@@ -112,7 +120,7 @@ def flatten(
else:
schema_parser = None
- parser = JSONParser(
+ with JSONParser(
json_filename=input_name,
root_list_path=None if root_is_list else root_list_path,
schema_parser=schema_parser,
@@ -126,33 +134,33 @@ def flatten(
preserve_fields=preserve_fields,
remove_empty_schema_columns=remove_empty_schema_columns,
truncation_length=truncation_length,
- )
- parser.parse()
-
- def spreadsheet_output(spreadsheet_output_class, name):
- spreadsheet_output = spreadsheet_output_class(
- parser=parser,
- main_sheet_name=main_sheet_name,
- output_name=name,
- sheet_prefix=sheet_prefix,
- )
- spreadsheet_output.write_sheets()
-
- if output_format == "all":
- if not output_name:
- output_name = "flattened"
- for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items():
- spreadsheet_output(
- spreadsheet_output_class, output_name + FORMATS_SUFFIX[format_name]
+ persist=True,
+ ) as parser:
+
+ def spreadsheet_output(spreadsheet_output_class, name):
+ spreadsheet_output = spreadsheet_output_class(
+ parser=parser,
+ main_sheet_name=main_sheet_name,
+ output_name=name,
+ sheet_prefix=sheet_prefix,
)
+ spreadsheet_output.write_sheets()
+
+ if output_format == "all":
+ if not output_name:
+ output_name = "flattened"
+ for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items():
+ spreadsheet_output(
+ spreadsheet_output_class, output_name + FORMATS_SUFFIX[format_name]
+ )
- elif output_format in OUTPUT_FORMATS.keys(): # in dictionary of allowed formats
- if not output_name:
- output_name = "flattened" + FORMATS_SUFFIX[output_format]
- spreadsheet_output(OUTPUT_FORMATS[output_format], output_name)
+ elif output_format in OUTPUT_FORMATS.keys(): # in dictionary of allowed formats
+ if not output_name:
+ output_name = "flattened" + FORMATS_SUFFIX[output_format]
+ spreadsheet_output(OUTPUT_FORMATS[output_format], output_name)
- else:
- raise Exception("The requested format is not available")
+ else:
+ raise Exception("The requested format is not available")
# From http://bugs.python.org/issue16535
@@ -179,7 +187,103 @@ def decimal_default(o):
raise TypeError(repr(o) + " is not JSON serializable")
+# This is just to make sure ensure_ascii and default are correct for the streaming library
+class CustomJSONEncoder(json.JSONEncoder):
+ def __init__(self, **kw):
+ super().__init__(**kw)
+ # overwrite these no matter what is passed to __init__
+ self.ensure_ascii = False
+ self.default = decimal_default
+
+
+def get_output(output_name, xml=False):
+ if not output_name:
+ if xml:
+ return sys.stdout.buffer
+ else:
+ return sys.stdout
+ if xml:
+ return codecs.open(output_name, "wb")
+ return codecs.open(output_name, "w", encoding="utf-8")
+
+
def unflatten(
+ input_name,
+ output_name=None,
+ cell_source_map=None,
+ root_is_list=False,
+ xml=False,
+ **kw
+):
+ unflatten_kw = {
+ "output_name": output_name,
+ "cell_source_map": cell_source_map,
+ "root_is_list": root_is_list,
+ "xml": xml,
+ }
+ unflatten_kw.update(kw)
+
+ zodb_db_location = tempfile.gettempdir() + "/flattentool-" + str(uuid.uuid4())
+ zodb_storage = zc.zlibstorage.ZlibStorage(
+ ZODB.FileStorage.FileStorage(zodb_db_location)
+ )
+ db = ZODB.DB(zodb_storage)
+ unflatten_kw["db"] = db
+
+ try:
+ if xml:
+ with get_output(output_name, xml=True) as xml_file, lxml.etree.xmlfile(
+ xml_file, encoding="utf-8"
+ ) as xml_stream:
+ unflatten_kw["xml_stream"] = xml_stream
+ if cell_source_map:
+ with get_output(
+ cell_source_map
+ ) as cell_source_map_file, jsonstreams.Stream(
+ jsonstreams.Type.object,
+ fd=cell_source_map_file,
+ indent=4,
+ encoder=CustomJSONEncoder,
+ ) as cell_source_map_stream:
+ unflatten_kw["cell_source_map_stream"] = cell_source_map_stream
+ _unflatten(input_name, **unflatten_kw)
+ else:
+ _unflatten(input_name, **unflatten_kw)
+
+ else:
+ json_stream_args = {"indent": 4, "encoder": CustomJSONEncoder}
+ if root_is_list:
+ json_stream_args["jtype"] = jsonstreams.Type.array
+ else:
+ json_stream_args["jtype"] = jsonstreams.Type.object
+
+ with get_output(output_name) as json_file, jsonstreams.Stream(
+ fd=json_file, **json_stream_args
+ ) as json_stream:
+ unflatten_kw["json_stream"] = json_stream
+ if cell_source_map:
+ with get_output(
+ cell_source_map
+ ) as cell_source_map_file, jsonstreams.Stream(
+ jsonstreams.Type.object,
+ fd=cell_source_map_file,
+ indent=4,
+ encoder=CustomJSONEncoder,
+ ) as cell_source_map_stream:
+ unflatten_kw["cell_source_map_stream"] = cell_source_map_stream
+ _unflatten(input_name, **unflatten_kw)
+ else:
+ _unflatten(input_name, **unflatten_kw)
+
+ finally:
+ db.close()
+ os.remove(zodb_db_location)
+ os.remove(zodb_db_location + ".lock")
+ os.remove(zodb_db_location + ".index")
+ os.remove(zodb_db_location + ".tmp")
+
+
+def _unflatten(
input_name,
base_json=None,
input_format=None,
@@ -205,6 +309,10 @@ def unflatten(
disable_local_refs=False,
xml_comment=None,
truncation_length=3,
+ json_stream=None,
+ cell_source_map_stream=None,
+ xml_stream=None,
+ db=None,
**_
):
"""
@@ -218,20 +326,18 @@ def unflatten(
if metatab_name and base_json:
raise Exception("Not allowed to use base_json with metatab")
- if root_is_list:
- base = None
- elif base_json:
+ if not root_is_list and base_json:
with open(base_json) as fp:
base = json.load(fp, object_pairs_hook=OrderedDict)
- else:
- base = OrderedDict()
+ for key, value in base.items():
+ json_stream.write(key, value)
base_configuration = parse_sheet_configuration(
[item.strip() for item in default_configuration.split(",")]
)
- cell_source_map_data = OrderedDict()
heading_source_map_data = OrderedDict()
+ meta_result = None
if metatab_name and not root_is_list:
spreadsheet_input_class = INPUT_FORMATS[input_format]
@@ -255,7 +361,7 @@ def unflatten(
spreadsheet_input.encoding = encoding
spreadsheet_input.read_sheets()
(
- result,
+ meta_result,
cell_source_map_data_meta,
heading_source_map_data_meta,
) = spreadsheet_input.fancy_unflatten(
@@ -264,7 +370,9 @@ def unflatten(
)
for key, value in (cell_source_map_data_meta or {}).items():
## strip off meta/0/ from start of source map as actually data is at top level
- cell_source_map_data[key[7:]] = value
+ if cell_source_map_stream:
+ cell_source_map_stream.write(key[7:], value)
+
for key, value in (heading_source_map_data_meta or {}).items():
## strip off meta/ from start of source map as actually data is at top level
heading_source_map_data[key[5:]] = value
@@ -274,9 +382,6 @@ def unflatten(
spreadsheet_input.sheet_configuration.get(metatab_name, {})
)
- if result:
- base.update(result[0])
-
if root_list_path is None:
root_list_path = base_configuration.get("RootListPath", "main")
if id_name is None:
@@ -309,54 +414,85 @@ def unflatten(
spreadsheet_input.parser = parser
spreadsheet_input.encoding = encoding
spreadsheet_input.read_sheets()
- (
- result,
- cell_source_map_data_main,
- heading_source_map_data_main,
- ) = spreadsheet_input.fancy_unflatten(
- with_cell_source_map=cell_source_map,
- with_heading_source_map=heading_source_map,
- )
- cell_source_map_data.update(cell_source_map_data_main or {})
- heading_source_map_data.update(heading_source_map_data_main or {})
- if root_is_list:
- base = list(result)
- else:
- base[root_list_path] = list(result)
if xml:
xml_root_tag = base_configuration.get("XMLRootTag", "iati-activities")
- xml_output = toxml(
- base,
- xml_root_tag,
- xml_schemas=xml_schemas,
- root_list_path=root_list_path,
- xml_comment=xml_comment,
- )
- if output_name is None:
- sys.stdout.buffer.write(xml_output)
- else:
- with codecs.open(output_name, "wb") as fp:
- fp.write(xml_output)
- else:
- if output_name is None:
- print(
- json.dumps(base, indent=4, default=decimal_default, ensure_ascii=False)
- )
- else:
- with codecs.open(output_name, "w", encoding="utf-8") as fp:
- json.dump(
- base, fp, indent=4, default=decimal_default, ensure_ascii=False
- )
- if cell_source_map:
- with codecs.open(cell_source_map, "w", encoding="utf-8") as fp:
- json.dump(
- cell_source_map_data,
- fp,
- indent=4,
- default=decimal_default,
- ensure_ascii=False,
- )
+
+ if not metatab_only:
+ xml_stream.write_declaration()
+ with xml_stream.element(xml_root_tag):
+ write_comment(xml_stream, xml_comment)
+
+ for (
+ single_result,
+ cell_source_map_data_main,
+ heading_source_map_data_main,
+ ) in spreadsheet_input.unflatten_with_storage(
+ with_cell_source_map=cell_source_map,
+ with_heading_source_map=heading_source_map,
+ db=db,
+ ):
+
+ schema_dict = None
+ if xml_schemas:
+ schema_dict = generate_schema_dict(xml_schemas, root_list_path)
+
+ for item in single_result:
+ xml_item(xml_stream, item, root_list_path, schema_dict)
+
+ if cell_source_map_stream and cell_source_map_data_main:
+ for key, value in cell_source_map_data_main.items():
+ cell_source_map_stream.write(key, value)
+
+ for key, value in (heading_source_map_data_main or {}).items():
+ if key in heading_source_map_data:
+ for item in heading_source_map_data_main[key]:
+ if item not in heading_source_map_data[key]:
+ heading_source_map_data[key].append(item)
+ else:
+ heading_source_map_data[key] = heading_source_map_data_main[
+ key
+ ]
+
+ if not xml:
+ if meta_result:
+ for key, value in meta_result[0].items():
+ json_stream.write(key, value)
+
+ if not metatab_only:
+ if not root_is_list:
+ list_stream = json_stream.subarray(root_list_path)
+ else:
+ list_stream = json_stream
+
+ for (
+ single_result,
+ cell_source_map_data_main,
+ heading_source_map_data_main,
+ ) in spreadsheet_input.unflatten_with_storage(
+ with_cell_source_map=cell_source_map,
+ with_heading_source_map=heading_source_map,
+ db=db,
+ ):
+
+ if cell_source_map_stream and cell_source_map_data_main:
+ for key, value in cell_source_map_data_main.items():
+ cell_source_map_stream.write(key, value)
+
+ for item in single_result:
+ list_stream.write(item)
+
+ for key, value in (heading_source_map_data_main or {}).items():
+ if key in heading_source_map_data:
+ for item in heading_source_map_data_main[key]:
+ if item not in heading_source_map_data[key]:
+ heading_source_map_data[key].append(item)
+ else:
+ heading_source_map_data[key] = heading_source_map_data_main[key]
+
+ if not root_is_list:
+ list_stream.close()
+
if heading_source_map:
with codecs.open(heading_source_map, "w", encoding="utf-8") as fp:
json.dump(
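
The reworked `unflatten` above pairs a throwaway on-disk ZODB with streaming writers and guarantees cleanup in `finally`. A compressed, illustrative sketch of that lifecycle (the output file name is hypothetical), using the same `jsonstreams` calls as the code:

```python
# Sketch of the pattern above: a temporary zlib-compressed ZODB on disk,
# streamed JSON output via jsonstreams, and cleanup of the side files.
import os
import tempfile
import uuid

import jsonstreams
import zc.zlibstorage
import ZODB
import ZODB.FileStorage

db_location = os.path.join(tempfile.gettempdir(), "flattentool-" + str(uuid.uuid4()))
storage = zc.zlibstorage.ZlibStorage(ZODB.FileStorage.FileStorage(db_location))
db = ZODB.DB(storage)
try:
    with jsonstreams.Stream(jsonstreams.Type.object, filename="out.json", indent=4) as out:
        with out.subarray("main") as main:  # the root list is streamed item by item
            main.write({"id": 1})
finally:
    db.close()
    for suffix in ("", ".lock", ".index", ".tmp"):
        os.remove(db_location + suffix)
```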
diff --git a/flattentool/input.py b/flattentool/input.py
index 62d59451..d09d7948 100644
--- a/flattentool/input.py
+++ b/flattentool/input.py
@@ -7,14 +7,19 @@
import datetime
import os
+import uuid
from collections import OrderedDict, UserDict
from csv import DictReader
from csv import reader as csvreader
from decimal import Decimal, InvalidOperation
from warnings import warn
+import BTrees
import openpyxl
+import persistent.list
import pytz
+import transaction
+import ZODB
from openpyxl.utils.cell import _get_column_letter
from flattentool.exceptions import DataErrorWarning
@@ -258,6 +263,7 @@ def __init__(
xml=False,
base_configuration={},
use_configuration=True,
+ persist=False,
):
self.input_name = input_name
self.root_list_path = root_list_path
@@ -275,6 +281,8 @@ def __init__(
self.base_configuration = base_configuration or {}
self.sheet_configuration = {}
self.use_configuration = use_configuration
+ self.persist = persist
+ self.actual_headings = {}
def get_sub_sheets_lines(self):
for sub_sheet_name in self.sub_sheet_names:
@@ -306,13 +314,20 @@ def get_sheet_headings(self, sheet_name):
def read_sheets(self):
raise NotImplementedError
- def do_unflatten(self):
+ def do_unflatten(self, sheet_lines=None):
main_sheet_by_ocid = OrderedDict()
- sheets = list(self.get_sub_sheets_lines())
+ if sheet_lines:
+ sheets = sheet_lines.items()
+ else:
+ sheets = list(self.get_sub_sheets_lines())
for i, sheet in enumerate(sheets):
sheet_name, lines = sheet
try:
- actual_headings = self.get_sheet_headings(sheet_name)
+ # cache headings
+ actual_headings = self.actual_headings.get(sheet_name)
+ if not actual_headings:
+ actual_headings = self.get_sheet_headings(sheet_name)
+ self.actual_headings[sheet_name] = actual_headings
# If sheet is empty or too many lines have been skipped
if not actual_headings:
continue
@@ -384,7 +399,14 @@ def do_unflatten(self):
except NotImplementedError:
# The ListInput type used in the tests doesn't support getting headings.
actual_headings = None
- for j, line in enumerate(lines):
+
+ if not sheet_lines:
+ lines_generator = enumerate(lines)
+ else:
+ # when sheet lines are supplied, the sheet row numbers come from the dictionary keys
+ lines_generator = sorted(list(lines.items()))
+
+ for j, line in lines_generator:
if all(x is None or x == "" for x in line.values()):
# if all(x == '' for x in line.values()):
continue
@@ -452,14 +474,77 @@ def unflatten(self):
result = extract_list_to_value(result)
return result
- def fancy_unflatten(self, with_cell_source_map, with_heading_source_map):
- cell_tree = self.do_unflatten()
+ def unflatten_with_storage(
+ self, with_cell_source_map, with_heading_source_map, db=None
+ ):
+
+ if not db:
+ # If None, in-memory storage is used.
+ db = ZODB.DB(None)
+
+ self.connection = db.open()
+ root = self.connection.root
+
+ # Each top-level object is assigned an integer. This way we preserve ordering as much as possible
+ root.object_store = BTrees.IOBTree.BTree()
+
+ # This maps the top-level id field to its integer index.
+ root.object_index = BTrees.OIBTree.BTree()
+
+ index = 0
+
+ for sheet, rows in self.get_sub_sheets_lines():
+ for row_number, row in enumerate(rows):
+
+ ## Use a uuid to avoid clashing with an existing key for objects with no id
+ top_level_id = row.get(self.id_name) or str(uuid.uuid4())
+
+ current_index = root.object_index.get(top_level_id)
+
+ if current_index is None:
+ current_index = index
+ root.object_index[top_level_id] = current_index
+ root.object_store[current_index] = persistent.list.PersistentList()
+ index += 1
+
+ root.object_store[current_index].append((sheet, row_number, row))
+
+ if row_number != 0 and row_number % 1000 == 0:
+ transaction.commit()
+ self.connection.cacheMinimize()
+
+ transaction.commit()
+
+ self.connection.cacheMinimize()
+
+ for current_index, row_list in root.object_store.items():
+ sheet_lines = OrderedDict()
+ for sheet, row_number, row in row_list:
+ if sheet not in sheet_lines:
+ sheet_lines[sheet] = OrderedDict()
+ sheet_lines[sheet][row_number] = row
+ yield self.fancy_unflatten(
+ with_cell_source_map, with_heading_source_map, sheet_lines, current_index
+ )
+
+ self.connection.cacheMinimize()
+
+ def fancy_unflatten(
+ self,
+ with_cell_source_map,
+ with_heading_source_map,
+ sheet_lines=None,
+ index=None,
+ ):
+ cell_tree = self.do_unflatten(sheet_lines=sheet_lines)
result = extract_list_to_value(cell_tree)
ordered_cell_source_map = None
heading_source_map = None
if with_cell_source_map or with_heading_source_map:
cell_source_map = extract_list_to_error_path(
- [] if self.root_is_list else [self.root_list_path], cell_tree
+ [] if self.root_is_list else [self.root_list_path],
+ cell_tree,
+ index=index,
)
ordered_items = sorted(cell_source_map.items())
row_source_map = OrderedDict()
@@ -502,10 +587,12 @@ def fancy_unflatten(self, with_cell_source_map, with_heading_source_map):
return result, ordered_cell_source_map, heading_source_map
-def extract_list_to_error_path(path, input):
+def extract_list_to_error_path(path, input, index=None):
output = {}
+ if index is not None:
+ assert len(input) <= 1
for i, item in enumerate(input):
- res = extract_dict_to_error_path(path + [i], item)
+ res = extract_dict_to_error_path(path + [i if index is None else index], item)
for p in res:
assert p not in output, _("Already have key {}").format(p)
output[p] = res[p]
@@ -652,7 +739,14 @@ class BadXLSXZipFile(BadZipFile):
class XLSXInput(SpreadsheetInput):
def read_sheets(self):
try:
- self.workbook = openpyxl.load_workbook(self.input_name, data_only=True)
+ if self.vertical_orientation:
+ # read_only mode only works when reading rows not columns
+ self.workbook = openpyxl.load_workbook(self.input_name, data_only=True)
+ else:
+ self.workbook = openpyxl.load_workbook(
+ self.input_name, data_only=True, read_only=True
+ )
+
except BadZipFile as e: # noqa
# TODO when we have python3 only add 'from e' to show exception chain
raise BadXLSXZipFile(
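
`unflatten_with_storage` above buckets every sheet row under its top-level object before unflattening one object at a time, which is what bounds memory use. A self-contained sketch of that bucketing with illustrative data (the real code also commits every 1000 rows and shrinks the ZODB cache):

```python
# Illustrative sketch of the grouping in unflatten_with_storage: rows from
# all sheets are bucketed by top-level id, preserving first-seen order.
import uuid

import BTrees.IOBTree
import BTrees.OIBTree
import persistent.list
import transaction
import ZODB

root = ZODB.DB(None).open().root             # None -> in-memory storage
root.object_store = BTrees.IOBTree.BTree()   # int index -> rows for one object
root.object_index = BTrees.OIBTree.BTree()   # top-level id -> int index

rows = [("main", 0, {"id": "A", "x": 1}), ("main", 1, {"id": "A", "x": 2})]
next_index = 0
for sheet, row_number, row in rows:
    top_level_id = row.get("id") or str(uuid.uuid4())  # uuid avoids key clashes
    index = root.object_index.get(top_level_id)
    if index is None:
        index = next_index
        root.object_index[top_level_id] = index
        root.object_store[index] = persistent.list.PersistentList()
        next_index += 1
    root.object_store[index].append((sheet, row_number, row))
transaction.commit()
print(len(root.object_store))  # -> 1: both rows belong to object "A"
```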
diff --git a/flattentool/json_input.py b/flattentool/json_input.py
index fa9634d8..79567c0e 100644
--- a/flattentool/json_input.py
+++ b/flattentool/json_input.py
@@ -7,18 +7,24 @@
import codecs
import copy
-import json
import os
+import tempfile
+import uuid
from collections import OrderedDict
from decimal import Decimal
from warnings import warn
+import BTrees.OOBTree
+import ijson
+import transaction
import xmltodict
+import zc.zlibstorage
+import ZODB.FileStorage
from flattentool.i18n import _
from flattentool.input import path_search
from flattentool.schema import make_sub_sheet_name
-from flattentool.sheet import Sheet
+from flattentool.sheet import PersistentSheet
BASIC_TYPES = [str, bool, int, Decimal, type(None)]
@@ -112,9 +118,26 @@ def __init__(
remove_empty_schema_columns=False,
rollup=False,
truncation_length=3,
+ persist=False,
):
+ if persist:
+ self.zodb_db_location = (
+ tempfile.gettempdir() + "/flattentool-" + str(uuid.uuid4())
+ )
+ zodb_storage = zc.zlibstorage.ZlibStorage(
+ ZODB.FileStorage.FileStorage(self.zodb_db_location)
+ )
+ self.db = ZODB.DB(zodb_storage)
+ else:
+ # If None, in-memory storage is used.
+ self.db = ZODB.DB(None)
+
+ self.connection = self.db.open()
+ root = self.connection.root
+ root.sheet_store = BTrees.OOBTree.BTree()
+
self.sub_sheets = {}
- self.main_sheet = Sheet()
+ self.main_sheet = PersistentSheet(connection=self.connection, name="")
self.root_list_path = root_list_path
self.root_id = root_id
self.use_titles = use_titles
@@ -125,9 +148,17 @@ def __init__(
self.filter_value = filter_value
self.remove_empty_schema_columns = remove_empty_schema_columns
self.seen_paths = set()
+ self.persist = persist
if schema_parser:
- self.main_sheet = copy.deepcopy(schema_parser.main_sheet)
+ self.main_sheet = PersistentSheet.from_sheet(
+ schema_parser.main_sheet, self.connection
+ )
+ for sheet_name, sheet in list(self.sub_sheets.items()):
+ self.sub_sheets[sheet_name] = PersistentSheet.from_sheet(
+ sheet, self.connection
+ )
+
self.sub_sheets = copy.deepcopy(schema_parser.sub_sheets)
if remove_empty_schema_columns:
# Don't use columns from the schema parser
@@ -194,18 +225,13 @@ def __init__(
_("Only one of json_file or root_json_dict should be supplied")
)
- if json_filename:
- with codecs.open(json_filename, encoding="utf-8") as json_file:
- try:
- self.root_json_dict = json.load(
- json_file, object_pairs_hook=OrderedDict, parse_float=Decimal
- )
- except UnicodeError as err:
- raise BadlyFormedJSONErrorUTF8(*err.args)
- except ValueError as err:
- raise BadlyFormedJSONError(*err.args)
- else:
- self.root_json_dict = root_json_dict
+ if not json_filename:
+ if self.root_list_path is None:
+ self.root_json_list = root_json_dict
+ else:
+ self.root_json_list = path_search(
+ root_json_dict, self.root_list_path.split("/")
+ )
if preserve_fields:
# Extract fields to be preserved from input file (one path per line)
@@ -240,19 +266,37 @@ def __init__(
self.preserve_fields = None
self.preserve_fields_input = None
+ if json_filename:
+ if self.root_list_path is None:
+ path = "item"
+ else:
+ path = root_list_path.replace("/", ".") + ".item"
+
+ json_file = codecs.open(json_filename, encoding="utf-8")
+
+ self.root_json_list = ijson.items(json_file, path, map_type=OrderedDict)
+
+ try:
+ self.parse()
+ except ijson.common.IncompleteJSONError as err:
+ raise BadlyFormedJSONError(*err.args)
+ except UnicodeDecodeError as err:
+ raise BadlyFormedJSONErrorUTF8(*err.args)
+ finally:
+ if json_filename:
+ json_file.close()
+
def parse(self):
- if self.root_list_path is None:
- root_json_list = self.root_json_dict
- else:
- root_json_list = path_search(
- self.root_json_dict, self.root_list_path.split("/")
- )
- for json_dict in root_json_list:
+ for num, json_dict in enumerate(self.root_json_list):
if json_dict is None:
# This is particularly useful for IATI XML, in order to not
# fall over on empty activity, e.g.
continue
self.parse_json_dict(json_dict, sheet=self.main_sheet)
+ if num % 2000 == 0 and num != 0:
+ transaction.commit()
+
+ transaction.commit()
if self.remove_empty_schema_columns:
# Remove sheets with no lines of data
@@ -501,7 +545,9 @@ def parse_json_dict(
parent_name, key, truncation_length=self.truncation_length
)
if sub_sheet_name not in self.sub_sheets:
- self.sub_sheets[sub_sheet_name] = Sheet(name=sub_sheet_name)
+ self.sub_sheets[sub_sheet_name] = PersistentSheet(
+ name=sub_sheet_name, connection=self.connection
+ )
for json_dict in value:
if json_dict is None:
@@ -518,4 +564,16 @@ def parse_json_dict(
raise ValueError(_("Unsupported type {}").format(type(value)))
if top:
- sheet.lines.append(flattened_dict)
+ sheet.append_line(flattened_dict)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, traceback):
+ if self.persist:
+ self.connection.close()
+ self.db.close()
+ os.remove(self.zodb_db_location)
+ os.remove(self.zodb_db_location + ".lock")
+ os.remove(self.zodb_db_location + ".index")
+ os.remove(self.zodb_db_location + ".tmp")
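
With parsing moved into `__init__` and cleanup into `__exit__`, `JSONParser` is now used as a context manager. A hypothetical usage sketch (the file name is illustrative):

```python
# Hypothetical usage of the reworked JSONParser: parsing happens on
# construction, and the `with` block guarantees the temporary ZODB is removed.
from flattentool.json_input import JSONParser

with JSONParser(json_filename="input.json", root_list_path="main", persist=True) as parser:
    for line in parser.main_sheet.lines:  # `lines` is now a lazy generator
        print(line)
```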
diff --git a/flattentool/output.py b/flattentool/output.py
index b92b0d02..947ceac6 100644
--- a/flattentool/output.py
+++ b/flattentool/output.py
@@ -50,7 +50,7 @@ def close(self):
class XLSXOutput(SpreadsheetOutput):
def open(self):
- self.workbook = openpyxl.Workbook()
+ self.workbook = openpyxl.Workbook(write_only=True)
def write_sheet(self, sheet_name, sheet):
sheet_header = list(sheet)
@@ -75,7 +75,6 @@ def write_sheet(self, sheet_name, sheet):
worksheet.append(line)
def close(self):
- self.workbook.remove(self.workbook.active)
self.workbook.save(self.output_name)
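
The two changes above go together: a write-only workbook streams rows to disk and, unlike a regular workbook, is created with no default active sheet, so `close()` no longer has a blank sheet to remove. A minimal demonstration:

```python
# Minimal demonstration of openpyxl's write-only mode, as used above.
import openpyxl

wb = openpyxl.Workbook(write_only=True)  # no default sheet is created
ws = wb.create_sheet("release")
ws.append(["heading1", "heading2"])      # write-only sheets are append-only
ws.append(["a", "b"])
wb.save("release.xlsx")                  # rows are streamed, not held in memory
```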
diff --git a/flattentool/sheet.py b/flattentool/sheet.py
index 05f2159a..df6b99be 100644
--- a/flattentool/sheet.py
+++ b/flattentool/sheet.py
@@ -1,3 +1,8 @@
+import copy
+
+import BTrees.IOBTree
+
+
class Sheet(object):
"""
An abstract representation of a single sheet of a spreadsheet.
@@ -8,10 +13,14 @@ def __init__(self, columns=None, root_id="", name=None):
self.id_columns = []
self.columns = columns if columns else []
self.titles = {}
- self.lines = []
+ self._lines = []
self.root_id = root_id
self.name = name
+ @property
+ def lines(self):
+ return self._lines
+
def add_field(self, field, id_field=False):
columns = self.id_columns if id_field else self.columns
if field not in columns:
@@ -27,3 +36,39 @@ def __iter__(self):
yield column
for column in self.columns:
yield column
+
+ def append_line(self, flattened_dict):
+ self._lines.append(flattened_dict)
+
+
+class PersistentSheet(Sheet):
+ """
+ A sheet that is persisted in ZODB database.
+
+ """
+
+ def __init__(self, columns=None, root_id="", name=None, connection=None):
+ super().__init__(columns=columns, root_id=root_id, name=name)
+ self.connection = connection
+ self.index = 0
+ connection.root.sheet_store[self.name] = BTrees.IOBTree.BTree()
+
+ @property
+ def lines(self):
+ for key, value in self.connection.root.sheet_store[self.name].items():
+ if key % 5000 == 0:
+ self.connection.cacheMinimize()
+ yield value
+
+ def append_line(self, flattened_dict):
+ self.connection.root.sheet_store[self.name][self.index] = flattened_dict
+ self.index += 1
+
+ @classmethod
+ def from_sheet(cls, sheet, connection):
+ instance = cls(name=sheet.name, connection=connection)
+ instance.id_columns = copy.deepcopy(sheet.id_columns)
+ instance.columns = copy.deepcopy(sheet.columns)
+ instance.titles = copy.deepcopy(sheet.titles)
+ instance.root_id = sheet.root_id
+ return instance
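
A short illustrative sketch of how `PersistentSheet` behaves (the setup here is hypothetical; in practice the connection comes from `JSONParser`): lines live in a per-sheet BTree and are read back lazily through the `lines` generator:

```python
# Illustrative sketch: PersistentSheet stores lines in a per-sheet BTree and
# reads them back lazily, trimming the ZODB cache every 5000 rows.
import BTrees.OOBTree
import ZODB

from flattentool.sheet import PersistentSheet

connection = ZODB.DB(None).open()  # None -> in-memory storage
connection.root.sheet_store = BTrees.OOBTree.BTree()

sheet = PersistentSheet(name="main", connection=connection)
sheet.append_line({"a": "b"})
print(list(sheet.lines))  # -> [{'a': 'b'}]
```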
diff --git a/flattentool/tests/fixtures/iati-org.xml b/flattentool/tests/fixtures/iati-org.xml
index 5b181d0e..2ff00249 100644
--- a/flattentool/tests/fixtures/iati-org.xml
+++ b/flattentool/tests/fixtures/iati-org.xml
@@ -1,8 +1,7 @@
-
-
-
- AA-AAA-123456789
-
-
-
+
+
+ AA-AAA-123456789
+
+
+
\ No newline at end of file
diff --git a/flattentool/tests/test_docs.py b/flattentool/tests/test_docs.py
index 86d156a7..a98c4191 100644
--- a/flattentool/tests/test_docs.py
+++ b/flattentool/tests/test_docs.py
@@ -151,7 +151,7 @@ def _simplify_line(line):
def _strip(output):
# Don't worry about any extra blank lines at the end either
outstr = str(output, "utf8").rstrip("\n")
- return "\n".join(line.rstrip(" ") for line in outstr.split("\n"))
+ return "\n".join(line.strip() for line in outstr.split("\n"))
# Useful for a coverage check - see developer docs for how to run the check
diff --git a/flattentool/tests/test_init.py b/flattentool/tests/test_init.py
index 511404db..124f12eb 100644
--- a/flattentool/tests/test_init.py
+++ b/flattentool/tests/test_init.py
@@ -660,10 +660,8 @@ def test_unflatten(tmpdir):
]
]
}"""
- assert lines_strip_whitespace(
- tmpdir.join("cell_source_map.json").read()
- ) == lines_strip_whitespace(expected)
data = json.loads(expected)
+ assert json.loads(tmpdir.join("cell_source_map.json").read()) == data
cells, rows = original_cell_and_row_locations(data)
# Make sure every cell in the original appeared in the cell source map exactly once
assert cells == [
@@ -855,10 +853,8 @@ def test_unflatten(tmpdir):
]
]
}"""
- assert lines_strip_whitespace(
- tmpdir.join("heading_source_map.json").read()
- ) == lines_strip_whitespace(expected_headings)
heading_data = json.loads(expected_headings)
+ assert json.loads(tmpdir.join("heading_source_map.json").read()) == heading_data
headings = original_headings(heading_data)
# Make sure every heading in the original appeared in the heading source map exactly once
assert headings == [
@@ -997,7 +993,9 @@ def test_unflatten_empty(tmpdir):
tmpdir.join("release.json").read()
) == lines_strip_whitespace(
"""{
- "main": []
+ "main": [
+
+ ]
}"""
)
diff --git a/flattentool/tests/test_json_input.py b/flattentool/tests/test_json_input.py
index 738d36bd..35357863 100644
--- a/flattentool/tests/test_json_input.py
+++ b/flattentool/tests/test_json_input.py
@@ -59,30 +59,29 @@ def test_jsonparser_arguments_exceptions(tmpdir):
def test_json_filename(tmpdir):
test_json = tmpdir.join("test.json")
- test_json.write('{"a":"b"}')
+ test_json.write('[{"a":"b"}]')
parser = JSONParser(json_filename=test_json.strpath)
- assert parser.root_json_dict == {"a": "b"}
+ assert list(parser.main_sheet.lines) == [{"a": "b"}]
def test_json_filename_utf8(tmpdir):
test_json = tmpdir.join("test.json")
- test_json.write_text('{"a":"éαГ😼𝒞人"}', encoding="utf-8")
+ test_json.write_text('[{"a":"éαГ😼𝒞人"}]', encoding="utf-8")
parser = JSONParser(json_filename=test_json.strpath)
- assert parser.root_json_dict == {"a": "éαГ😼𝒞人"}
+ assert list(parser.main_sheet.lines) == [{"a": "éαГ😼𝒞人"}]
def test_json_filename_ordered(tmpdir):
test_json = tmpdir.join("test.json")
- test_json.write('{"a":"b", "c": "d"}')
+ test_json.write('[{"a":"b", "c": "d"}]')
parser = JSONParser(json_filename=test_json.strpath)
- assert list(parser.root_json_dict.items()) == [("a", "b"), ("c", "d")]
+ assert list(parser.main_sheet.lines) == [{"a": "b", "c": "d"}]
def test_parse_empty_json_dict():
parser = JSONParser(root_json_dict={})
- parser.parse()
assert list(parser.main_sheet) == []
- assert parser.main_sheet.lines == []
+ assert list(parser.main_sheet.lines) == []
assert parser.sub_sheets == {}
@@ -93,9 +92,8 @@ def test_parse_basic_json_dict():
OrderedDict([("a", "e"), ("c", "f"),]),
]
)
- parser.parse()
assert list(parser.main_sheet) == ["a", "c"]
- assert parser.main_sheet.lines == [
+ assert list(parser.main_sheet.lines) == [
{"a": "b", "c": "d"},
{"a": "e", "c": "f"},
]
@@ -106,9 +104,8 @@ def test_parse_nested_dict_json_dict():
parser = JSONParser(
root_json_dict=[OrderedDict([("a", "b"), ("c", OrderedDict([("d", "e")])),])]
)
- parser.parse()
assert list(parser.main_sheet) == ["a", "c/d"]
- assert parser.main_sheet.lines == [{"a": "b", "c/d": "e"}]
+ assert list(parser.main_sheet.lines) == [{"a": "b", "c/d": "e"}]
assert parser.sub_sheets == {}
@@ -116,9 +113,8 @@ def test_parse_nested_list_json_dict():
parser = JSONParser(
root_json_dict=[OrderedDict([("a", "b"), ("c", [OrderedDict([("d", "e")])]),])]
)
- parser.parse()
assert list(parser.main_sheet) == ["a"]
- assert parser.main_sheet.lines == [{"a": "b"}]
+ assert list(parser.main_sheet.lines) == [{"a": "b"}]
listify(parser.sub_sheets) == {"c": ["d"]}
parser.sub_sheets["c"].lines == [{"d": "e"}]
@@ -127,9 +123,8 @@ def test_parse_array():
parser = JSONParser(
root_json_dict=[OrderedDict([("testarray", ["item", "anotheritem", 42])])]
)
- parser.parse()
assert list(parser.main_sheet) == ["testarray"]
- assert parser.main_sheet.lines == [{"testarray": "item;anotheritem;42"}]
+ assert list(parser.main_sheet.lines) == [{"testarray": "item;anotheritem;42"}]
assert parser.sub_sheets == {}
@@ -138,9 +133,8 @@ def test_root_list_path():
root_json_dict={"custom_key": [OrderedDict([("a", "b"), ("c", "d"),])]},
root_list_path="custom_key",
)
- parser.parse()
assert list(parser.main_sheet) == ["a", "c"]
- assert parser.main_sheet.lines == [{"a": "b", "c": "d"}]
+ assert list(parser.main_sheet.lines) == [{"a": "b", "c": "d"}]
assert parser.sub_sheets == {}
@@ -169,11 +163,12 @@ def test_parse_ids(self):
],
root_id="ocid",
)
- parser.parse()
assert list(parser.main_sheet) == ["ocid", "id", "a", "f/g"]
- assert parser.main_sheet.lines == [{"ocid": 1, "id": 2, "a": "b", "f/g": "h"}]
+ assert list(parser.main_sheet.lines) == [
+ {"ocid": 1, "id": 2, "a": "b", "f/g": "h"}
+ ]
listify(parser.sub_sheets) == {"c": ["ocid", "id", "c/0/id", "c/0/d"]}
- assert parser.sub_sheets["c"].lines == [
+ assert list(parser.sub_sheets["c"].lines) == [
{"ocid": 1, "id": 2, "c/0/id": 3, "c/0/d": "e"},
{"ocid": 1, "id": 2, "c/0/id": 3, "c/0/d": "e2"},
]
@@ -212,9 +207,8 @@ def test_parse_ids_subsheet(self):
],
root_id="ocid",
)
- parser.parse()
assert list(parser.main_sheet) == ["ocid", "id"]
- assert parser.main_sheet.lines == [{"ocid": 1, "id": 2,}]
+ assert list(parser.main_sheet.lines) == [{"ocid": 1, "id": 2,}]
assert listify(parser.sub_sheets) == {
"testnest": [
"ocid",
@@ -225,7 +219,7 @@ def test_parse_ids_subsheet(self):
],
"tes_c": ["ocid", "id", "testnest/0/id", "testnest/0/c/0/d"],
}
- assert parser.sub_sheets["testnest"].lines == [
+ assert list(parser.sub_sheets["testnest"].lines) == [
{
"ocid": 1,
"id": 2,
@@ -234,7 +228,7 @@ def test_parse_ids_subsheet(self):
"testnest/0/f/g": "h",
},
]
- assert parser.sub_sheets["tes_c"].lines == [
+ assert list(parser.sub_sheets["tes_c"].lines) == [
{"ocid": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e"},
{"ocid": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e2"},
]
@@ -271,15 +265,14 @@ def test_parse_ids_nested(self):
],
root_id="ocid",
)
- parser.parse()
assert list(parser.main_sheet) == ["ocid", "id", "a", "testnest/id", "f/g"]
- assert parser.main_sheet.lines == [
+ assert list(parser.main_sheet.lines) == [
{"ocid": 1, "id": 2, "a": "b", "testnest/id": 3, "f/g": "h"}
]
assert listify(parser.sub_sheets) == {
"tes_c": ["ocid", "id", "testnest/id", "testnest/c/0/d"]
}
- assert parser.sub_sheets["tes_c"].lines == [
+ assert list(parser.sub_sheets["tes_c"].lines) == [
{"ocid": 1, "id": 2, "testnest/id": 3, "testnest/c/0/d": "e"},
{"ocid": 1, "id": 2, "testnest/id": 3, "testnest/c/0/d": "e2"},
]
@@ -326,9 +319,8 @@ def test_sub_sheets(self, tmpdir, remove_empty_schema_columns):
schema_parser=schema_parser,
remove_empty_schema_columns=remove_empty_schema_columns,
)
- parser.parse()
assert list(parser.main_sheet) == ["a"]
- assert parser.main_sheet.lines == [{"a": "b"}]
+ assert list(parser.main_sheet.lines) == [{"a": "b"}]
assert len(parser.sub_sheets) == 2 if not remove_empty_schema_columns else 1
if not remove_empty_schema_columns:
assert list(parser.sub_sheets["c"]) == list(["ocid", "c/0/d", "c/0/f"])
@@ -352,11 +344,10 @@ def test_column_matching(self, tmpdir):
schema_parser = SchemaParser(schema_filename=test_schema.strpath)
schema_parser.parse()
parser = JSONParser(
- root_json_dict=[OrderedDict([("c", ["d"]),])], schema_parser=schema_parser
+ root_json_dict=[OrderedDict([("c", ["d"]),])], schema_parser=schema_parser,
)
- parser.parse()
assert list(parser.main_sheet) == ["c"]
- assert parser.main_sheet.lines == [{"c": "d"}]
+ assert list(parser.main_sheet.lines) == [{"c": "d"}]
assert len(parser.sub_sheets) == 0
def test_rollup(self):
@@ -390,9 +381,8 @@ def test_rollup(self):
root_id="ocid",
rollup=True,
)
- parser.parse()
assert list(parser.main_sheet) == ["testA/0/testB"]
- assert parser.main_sheet.lines == [{"testA/0/testB": "1"}]
+ assert list(parser.main_sheet.lines) == [{"testA/0/testB": "1"}]
assert len(parser.sub_sheets) == 1
assert set(parser.sub_sheets["testA"]) == set(
["ocid", "testA/0/testB", "testA/0/testC"]
@@ -438,9 +428,8 @@ def test_rollup_multiple_values(self, recwarn):
schema_parser=schema_parser,
rollup=True,
)
- parser.parse()
assert list(parser.main_sheet) == ["testA/0/testB"]
- assert parser.main_sheet.lines == [
+ assert list(parser.main_sheet.lines) == [
{
"testA/0/testB": "WARNING: More than one value supplied, consult the relevant sub-sheet for the data."
}
@@ -502,7 +491,6 @@ def test_two_parents(self):
],
schema_parser=schema_parser,
)
- parser.parse()
assert set(parser.main_sheet) == set()
assert set(parser.sub_sheets) == set(
["Atest", "Dtest", "Ate_Btest", "Dte_Btest"]
@@ -547,11 +535,12 @@ def test_parse_ids(self):
],
root_id="custom",
)
- parser.parse()
assert list(parser.main_sheet) == ["custom", "id", "a", "f/g"]
- assert parser.main_sheet.lines == [{"custom": 1, "id": 2, "a": "b", "f/g": "h"}]
+ assert list(parser.main_sheet.lines) == [
+ {"custom": 1, "id": 2, "a": "b", "f/g": "h"}
+ ]
assert listify(parser.sub_sheets) == {"c": ["custom", "id", "c/0/id", "c/0/d"]}
- assert parser.sub_sheets["c"].lines == [
+ assert list(parser.sub_sheets["c"].lines) == [
{"custom": 1, "id": 2, "c/0/id": 3, "c/0/d": "e"},
{"custom": 1, "id": 2, "c/0/id": 3, "c/0/d": "e2"},
]
@@ -590,9 +579,8 @@ def test_parse_ids_subsheet(self):
],
root_id="custom",
)
- parser.parse()
assert list(parser.main_sheet) == ["custom", "id"]
- assert parser.main_sheet.lines == [{"custom": 1, "id": 2,}]
+ assert list(parser.main_sheet.lines) == [{"custom": 1, "id": 2,}]
assert listify(parser.sub_sheets) == {
"testnest": [
"custom",
@@ -603,7 +591,7 @@ def test_parse_ids_subsheet(self):
],
"tes_c": ["custom", "id", "testnest/0/id", "testnest/0/c/0/d"],
}
- assert parser.sub_sheets["testnest"].lines == [
+ assert list(parser.sub_sheets["testnest"].lines) == [
{
"custom": 1,
"id": 2,
@@ -612,7 +600,7 @@ def test_parse_ids_subsheet(self):
"testnest/0/f/g": "h",
},
]
- assert parser.sub_sheets["tes_c"].lines == [
+ assert list(parser.sub_sheets["tes_c"].lines) == [
{"custom": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e"},
{"custom": 1, "id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e2"},
]
@@ -649,15 +637,14 @@ def test_parse_ids_nested(self):
],
root_id="custom",
)
- parser.parse()
assert list(parser.main_sheet) == ["custom", "id", "a", "testnest/id", "f/g"]
- assert parser.main_sheet.lines == [
+ assert list(parser.main_sheet.lines) == [
{"custom": 1, "id": 2, "a": "b", "testnest/id": 3, "f/g": "h"}
]
assert listify(parser.sub_sheets) == {
"tes_c": ["custom", "id", "testnest/id", "testnest/c/0/d"]
}
- assert parser.sub_sheets["tes_c"].lines == [
+ assert list(parser.sub_sheets["tes_c"].lines) == [
{"custom": 1, "id": 2, "testnest/id": 3, "testnest/c/0/d": "e"},
{"custom": 1, "id": 2, "testnest/id": 3, "testnest/c/0/d": "e2"},
]
@@ -687,11 +674,10 @@ def test_parse_ids(self):
],
root_id="",
)
- parser.parse()
assert list(parser.main_sheet) == ["id", "a", "f/g"]
- assert parser.main_sheet.lines == [{"id": 2, "a": "b", "f/g": "h"}]
+ assert list(parser.main_sheet.lines) == [{"id": 2, "a": "b", "f/g": "h"}]
assert listify(parser.sub_sheets) == {"c": ["id", "c/0/id", "c/0/d"]}
- assert parser.sub_sheets["c"].lines == [
+ assert list(parser.sub_sheets["c"].lines) == [
{"id": 2, "c/0/id": 3, "c/0/d": "e"},
{"id": 2, "c/0/id": 3, "c/0/d": "e2"},
]
@@ -729,17 +715,16 @@ def test_parse_ids_subsheet(self):
],
root_id="",
)
- parser.parse()
assert list(parser.main_sheet) == ["id"]
- assert parser.main_sheet.lines == [{"id": 2,}]
+ assert list(parser.main_sheet.lines) == [{"id": 2,}]
assert listify(parser.sub_sheets) == {
"testnest": ["id", "testnest/0/id", "testnest/0/a", "testnest/0/f/g"],
"tes_c": ["id", "testnest/0/id", "testnest/0/c/0/d"],
}
- assert parser.sub_sheets["testnest"].lines == [
+ assert list(parser.sub_sheets["testnest"].lines) == [
{"id": 2, "testnest/0/id": 3, "testnest/0/a": "b", "testnest/0/f/g": "h",},
]
- assert parser.sub_sheets["tes_c"].lines == [
+ assert list(parser.sub_sheets["tes_c"].lines) == [
{"id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e"},
{"id": 2, "testnest/0/id": 3, "testnest/0/c/0/d": "e2"},
]
@@ -775,15 +760,14 @@ def test_parse_ids_nested(self):
],
root_id="",
)
- parser.parse()
assert list(parser.main_sheet) == ["id", "a", "testnest/id", "f/g"]
- assert parser.main_sheet.lines == [
+ assert list(parser.main_sheet.lines) == [
{"id": 2, "a": "b", "testnest/id": 3, "f/g": "h"}
]
assert listify(parser.sub_sheets) == {
"tes_c": ["id", "testnest/id", "testnest/c/0/d"]
}
- assert parser.sub_sheets["tes_c"].lines == [
+ assert list(parser.sub_sheets["tes_c"].lines) == [
{"id": 2, "testnest/id": 3, "testnest/c/0/d": "e"},
{"id": 2, "testnest/id": 3, "testnest/c/0/d": "e2"},
]
diff --git a/flattentool/tests/test_json_input_is_unflatten_reversed.py b/flattentool/tests/test_json_input_is_unflatten_reversed.py
index cdd6a9a5..3007e2e2 100644
--- a/flattentool/tests/test_json_input_is_unflatten_reversed.py
+++ b/flattentool/tests/test_json_input_is_unflatten_reversed.py
@@ -80,7 +80,6 @@ def test_flatten(
schema_parser=schema_parser,
**extra_kwargs
)
- parser.parse()
expected_output_list = [
inject_root_id(root_id, expected_output_dict)
@@ -188,7 +187,6 @@ def test_flatten_multiplesheets(
schema_parser=schema_parser,
**extra_kwargs
)
- parser.parse()
expected_output_dict = OrderedDict(
[
@@ -197,11 +195,11 @@ def test_flatten_multiplesheets(
]
)
output = {
- sheet_name: sheet.lines
+ sheet_name: list(sheet.lines)
for sheet_name, sheet in parser.sub_sheets.items()
- if sheet.lines
+ if list(sheet.lines)
}
- output["custom_main"] = parser.main_sheet.lines
+ output["custom_main"] = list(parser.main_sheet.lines)
assert output == expected_output_dict
diff --git a/flattentool/tests/test_output.py b/flattentool/tests/test_output.py
index 023ce09b..ea47407b 100644
--- a/flattentool/tests/test_output.py
+++ b/flattentool/tests/test_output.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import os
+import sys
import openpyxl
import pytest
@@ -41,7 +42,10 @@ def test_blank_sheets(tmpdir):
wb = openpyxl.load_workbook(tmpdir.join("release.xlsx").strpath)
assert wb.sheetnames == ["release"]
rows = list(wb["release"].rows)
- assert len(rows) == 0
+ # openpyxl fixed this bug, but earlier versions of Python are stuck with it.
+ # Remove this workaround when we no longer support Python 3.5.
+ if sys.version_info >= (3, 6, 0):
+ assert len(rows) == 0
# Check CSV is Empty
assert tmpdir.join("release").listdir() == [
@@ -102,7 +106,7 @@ def test_empty_lines(tmpdir):
subsheet = Sheet(root_id="ocid")
subsheet.add_field("c")
parser = MockParser(["a", "d"], {"b": subsheet})
- parser.main_sheet.lines = []
+ parser.main_sheet._lines = []
for format_name, spreadsheet_output_class in output.FORMATS.items():
spreadsheet_output = spreadsheet_output_class(
parser=parser,
@@ -147,8 +151,8 @@ def test_populated_lines(tmpdir):
subsheet = Sheet(root_id="ocid")
subsheet.add_field("c")
parser = MockParser(["a"], {})
- parser.main_sheet.lines = [{"a": "cell1"}, {"a": "cell2"}]
- subsheet.lines = [{"c": "cell3"}, {"c": "cell4"}]
+ parser.main_sheet._lines = [{"a": "cell1"}, {"a": "cell2"}]
+ subsheet._lines = [{"c": "cell3"}, {"c": "cell4"}]
parser.sub_sheets["b"] = subsheet
for format_name, spreadsheet_output_class in output.FORMATS.items():
spreadsheet_output = spreadsheet_output_class(
@@ -206,7 +210,7 @@ def test_populated_lines(tmpdir):
def test_utf8(tmpdir):
parser = MockParser(["é"], {})
- parser.main_sheet.lines = [{"é": "éαГ😼𝒞人"}, {"é": "cell2"}]
+ parser.main_sheet._lines = [{"é": "éαГ😼𝒞人"}, {"é": "cell2"}]
for format_name, spreadsheet_output_class in output.FORMATS.items():
spreadsheet_output = spreadsheet_output_class(
parser=parser,
diff --git a/flattentool/tests/test_xml_input.py b/flattentool/tests/test_xml_input.py
index 4ab90784..d0539749 100644
--- a/flattentool/tests/test_xml_input.py
+++ b/flattentool/tests/test_xml_input.py
@@ -15,9 +15,8 @@ def test_xml_empty():
xml=True,
id_name="iati-identifier",
)
- parser.parse()
assert list(parser.main_sheet) == []
- assert parser.main_sheet.lines == []
+ assert list(parser.main_sheet.lines) == []
assert parser.sub_sheets == {}
@@ -30,7 +29,6 @@ def test_xml_basic_example():
xml=True,
id_name="iati-identifier",
)
- parser.parse()
assert list(parser.main_sheet) == [
"iati-identifier",
"reporting-org/@ref",
@@ -44,7 +42,7 @@ def test_xml_basic_example():
"activity-date/@iso-date",
"activity-date/@type",
]
- assert parser.main_sheet.lines == [
+ assert list(parser.main_sheet.lines) == [
{
"activity-date/@type": "1",
"reporting-org/narrative": "Organisation name",
@@ -80,7 +78,7 @@ def test_xml_basic_example():
"transaction/0/value/@value-date",
"transaction/0/value",
]
- assert parser.sub_sheets["transaction"].lines == [
+ assert list(parser.sub_sheets["transaction"].lines) == [
{
"transaction/0/value/@value-date": "2012-01-01",
"iati-identifier": "AA-AAA-123456789-ABC123",
@@ -115,7 +113,7 @@ def test_xml_basic_example():
"recipient-country/0/@code",
"recipient-country/0/@percentage",
]
- assert parser.sub_sheets["recipient-country"].lines == [
+ assert list(parser.sub_sheets["recipient-country"].lines) == [
{
"iati-identifier": "AA-AAA-123456789-ABC123",
"recipient-country/0/@code": "AF",
@@ -148,9 +146,8 @@ def test_varyin_transaction_count():
xml=True,
id_name="iati-identifier",
)
- parser.parse()
assert list(parser.main_sheet) == ["iati-identifier"]
- assert parser.main_sheet.lines == [
+ assert list(parser.main_sheet.lines) == [
{"iati-identifier": "AA-AAA-123456789-ABC123"},
{"iati-identifier": "AA-AAA-123456789-ABC124"},
{"iati-identifier": "AA-AAA-123456789-ABC125"},
@@ -162,7 +159,7 @@ def test_varyin_transaction_count():
"transaction/0/value/@value-date",
"transaction/0/value",
]
- assert parser.sub_sheets["transaction"].lines == [
+ assert list(parser.sub_sheets["transaction"].lines) == [
{
"iati-identifier": "AA-AAA-123456789-ABC123",
"transaction/0/value/@value-date": "2012-01-01",
@@ -251,16 +248,15 @@ def test_list_dict_consistency():
def test_xml_whitespace():
- parser = JSONParser(
- json_filename="flattentool/tests/fixtures/narrative_whitespace.xml",
- root_list_path="iati-activity",
- schema_parser=None,
- root_id="",
- xml=True,
- id_name="iati-identifier",
- )
-
try:
- parser.parse()
+ parser = JSONParser(
+ json_filename="flattentool/tests/fixtures/narrative_whitespace.xml",
+ root_list_path="iati-activity",
+ schema_parser=None,
+ root_id="",
+ xml=True,
+ id_name="iati-identifier",
+ )
+ assert parser
except TypeError as e:
raise e
diff --git a/flattentool/xml_output.py b/flattentool/xml_output.py
index a689d14c..5ed788ca 100644
--- a/flattentool/xml_output.py
+++ b/flattentool/xml_output.py
@@ -111,3 +111,30 @@ def toxml(
)
else:
return ET.tostring(root)
+
+
+def write_comment(xml_stream, xml_comment):
+ if xml_comment is None:
+ xml_comment = "XML generated by flatten-tool"
+ if xml_comment:
+ xml_stream.write(ET.Comment(xml_comment), pretty_print=True)
+
+
+def generate_schema_dict(xml_schemas, root_list_path):
+ return XMLSchemaWalker(xml_schemas).create_schema_dict(root_list_path)
+
+
+def xml_item(xml_stream, data, root_list_path="iati-activity", schema_dict=None):
+ nsmap = {
+ # This is "bound by definition" - see https://www.w3.org/XML/1998/namespace
+ "xml": "http://www.w3.org/XML/1998/namespace"
+ }
+
+ root = dict_to_xml(data, root_list_path, nsmap=nsmap)
+ if schema_dict:
+ sort_element(root, schema_dict)
+
+ xml_stream.write(root, pretty_print=True)
+ root = None
+
+ return schema_dict
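
These helpers plug into lxml's incremental writer, which streams each item out as soon as it is produced instead of building one big tree. A sketch of the surrounding pattern with hypothetical element content, using the same calls as `unflatten`:

```python
# Sketch of the incremental XML writing these helpers target: lxml's
# xmlfile context manager streams elements to the output as written.
import lxml.etree as ET

with open("out.xml", "wb") as f, ET.xmlfile(f, encoding="utf-8") as xml_stream:
    xml_stream.write_declaration()
    with xml_stream.element("iati-activities"):
        xml_stream.write(ET.Comment("XML generated by flatten-tool"), pretty_print=True)
        for identifier in ("AA-1", "AA-2"):  # hypothetical identifiers
            activity = ET.Element("iati-activity")
            ET.SubElement(activity, "iati-identifier").text = identifier
            xml_stream.write(activity, pretty_print=True)
```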
diff --git a/setup.py b/setup.py
index 6379e337..24c37097 100644
--- a/setup.py
+++ b/setup.py
@@ -35,6 +35,10 @@ def run(self):
"xmltodict",
"lxml",
"odfpy",
+ "zodb",
+ "zc.zlibstorage",
+ "ijson",
+ "jsonstreams",
]
setup(