From 6d07389231a8d33d024df01151134c526de52066 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 12 May 2020 20:05:29 +0200 Subject: [PATCH 1/6] test for orderedgroup being ordered by index (still failing) --- tests/model/TEMP1_Gutachten2-2.xml | 302 +++++++++++++++++++++++++++++ tests/model/test_ocrd_page.py | 16 ++ 2 files changed, 318 insertions(+) create mode 100644 tests/model/TEMP1_Gutachten2-2.xml diff --git a/tests/model/TEMP1_Gutachten2-2.xml b/tests/model/TEMP1_Gutachten2-2.xml new file mode 100644 index 000000000..869fc0358 --- /dev/null +++ b/tests/model/TEMP1_Gutachten2-2.xml @@ -0,0 +1,302 @@ + + + + OCR-D/core 2.4.2 + 2020-02-28T18:32:40 + 2020-02-28T18:32:40 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 7ffb81b85..342963094 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -139,6 +139,22 @@ def test_simple_types(self): self.assertEqual(reg.get_type(), 'page-number') self.assertTrue(isinstance(reg.get_type(), str)) + def test_orderedgroup_export_order(self): + """ + See https://github.com/OCR-D/core/issues/475 + """ + with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: + pcgts = parseString(f.read().encode('utf8'), silence=True) + og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup() + rris = og.get_RegionRefIndexed() + self.assertEqual([rri.index for rri in rris], list(range(0, 
17))) + rris = list(reversed(rris)) + self.assertEqual([rri.index for rri in rris], list(reversed(range(0, 17)))) + og.set_RegionRefIndexed(rris) + # reverse sort the RegionRefIndexeds + pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True) + og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup() + self.assertEqual([rri.index for rri in rris], list(range(0, 17))) if __name__ == '__main__': main() From 8bc25ee122eaa151a564f15ac3c7882f8c1e7e5a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 12 May 2020 20:25:57 +0200 Subject: [PATCH 2/6] wip: override exportChildren to sort children by @index --- .../ocrd_models/ocrd_page_generateds.py | 56 +++++++++---------- ocrd_models/ocrd_page_user_methods.py | 29 ++++++++++ 2 files changed, 55 insertions(+), 30 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 825c6b4b8..bc59c6367 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # -# Generated Thu Apr 30 00:28:17 2020 by generateDS.py version 2.35.21. -# Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] +# Generated Tue May 12 20:21:26 2020 by generateDS.py version 2.35.20. 
+# Python 3.7.6 (default, Jan 8 2020, 19:59:22) [GCC 7.3.0] # # Command line options: # ('-f', '') @@ -16,7 +16,7 @@ # repo/assets/data/schema/data/2019.xsd # # Command line: -# /home/kba/env/py3/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" repo/assets/data/schema/data/2019.xsd +# /home/kba/miniconda3/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" repo/assets/data/schema/data/2019.xsd # # Current working directory (os.getcwd()): # core @@ -247,12 +247,7 @@ def gds_validate_float_list( raise_parse_error(node, 'Requires sequence of float values') return values def gds_format_decimal(self, input_data, input_name=''): - return_value = '%s' % input_data - if '.' in return_value: - return_value = return_value.rstrip('0') - if return_value.endswith('.'): - return_value = return_value.rstrip('.') - return return_value + return ('%s' % input_data).rstrip('0') def gds_parse_decimal(self, input_data, node=None, input_name=''): try: decimal_value = decimal_.Decimal(input_data) @@ -266,7 +261,7 @@ def gds_validate_decimal(self, input_data, node=None, input_name=''): raise_parse_error(node, 'Requires decimal value') return value def gds_format_decimal_list(self, input_data, input_name=''): - return ' '.join([self.gds_format_decimal(item) for item in input_data]) + return '%s' % ' '.join(input_data) def gds_validate_decimal_list( self, input_data, node=None, input_name=''): values = input_data.split() @@ -6006,26 +6001,6 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), 
input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): - if pretty_print: - eol_ = '\n' - else: - eol_ = '' - if self.UserDefined is not None: - namespaceprefix_ = self.UserDefined_nsprefix_ + ':' if (UseCapturedNS_ and self.UserDefined_nsprefix_) else '' - self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) - for Labels_ in self.Labels: - namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' - Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - for RegionRefIndexed_ in self.RegionRefIndexed: - namespaceprefix_ = self.RegionRefIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.RegionRefIndexed_nsprefix_) else '' - RegionRefIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='RegionRefIndexed', pretty_print=pretty_print) - for OrderedGroupIndexed_ in self.OrderedGroupIndexed: - namespaceprefix_ = self.OrderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.OrderedGroupIndexed_nsprefix_) else '' - OrderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='OrderedGroupIndexed', pretty_print=pretty_print) - for UnorderedGroupIndexed_ in self.UnorderedGroupIndexed: - namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroupIndexed_nsprefix_) else '' - UnorderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroupIndexed', pretty_print=pretty_print) def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ if SaveElementTreeNode: @@ -6100,6 +6075,27 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec 
obj_.original_tagname_ = 'UnorderedGroupIndexed' def __hash__(self): return hash(self.id) + + def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): + if pretty_print: + eol_ = '\n' + else: + eol_ = '' + if self.UserDefined is not None: + namespaceprefix_ = self.UserDefined_nsprefix_ + ':' if (UseCapturedNS_ and self.UserDefined_nsprefix_) else '' + self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) + for Labels_ in self.Labels: + namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' + Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) + for RegionRefIndexed_ in sorted(self.RegionRefIndexed, key=lambda rri: rri.index): + namespaceprefix_ = self.RegionRefIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.RegionRefIndexed_nsprefix_) else '' + RegionRefIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='RegionRefIndexed', pretty_print=pretty_print) + for OrderedGroupIndexed_ in sorted(self.OrderedGroupIndexed, key=lambda ogi: ogi.index): + namespaceprefix_ = self.OrderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.OrderedGroupIndexed_nsprefix_) else '' + OrderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='OrderedGroupIndexed', pretty_print=pretty_print) + for UnorderedGroupIndexed_ in sorted(self.UnorderedGroupIndexed, key=lambda ugi: ugi.index): + namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroupIndexed_nsprefix_) else '' + UnorderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroupIndexed', pretty_print=pretty_print) # end class OrderedGroupType diff --git 
a/ocrd_models/ocrd_page_user_methods.py b/ocrd_models/ocrd_page_user_methods.py index 507b8ff39..6b9a637bc 100644 --- a/ocrd_models/ocrd_page_user_methods.py +++ b/ocrd_models/ocrd_page_user_methods.py @@ -83,6 +83,34 @@ def show(self): # # Replace the following method specifications with your own. +# +# export children sorted by index of the childelement +# +sort_children_by_index = MethodSpec(name='exportChildren', + source=r''' + def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): + if pretty_print: + eol_ = '\n' + else: + eol_ = '' + if self.UserDefined is not None: + namespaceprefix_ = self.UserDefined_nsprefix_ + ':' if (UseCapturedNS_ and self.UserDefined_nsprefix_) else '' + self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) + for Labels_ in self.Labels: + namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' + Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) + for RegionRefIndexed_ in sorted(self.RegionRefIndexed, key=lambda rri: rri.index): + namespaceprefix_ = self.RegionRefIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.RegionRefIndexed_nsprefix_) else '' + RegionRefIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='RegionRefIndexed', pretty_print=pretty_print) + for OrderedGroupIndexed_ in sorted(self.OrderedGroupIndexed, key=lambda ogi: ogi.index): + namespaceprefix_ = self.OrderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.OrderedGroupIndexed_nsprefix_) else '' + OrderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='OrderedGroupIndexed', pretty_print=pretty_print) + for UnorderedGroupIndexed_ in sorted(self.UnorderedGroupIndexed, 
key=lambda ugi: ugi.index): + namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroupIndexed_nsprefix_) else '' + UnorderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroupIndexed', pretty_print=pretty_print) +''', + class_names=r'OrderedGroupType$', + ) # # Hash by memory adress/id() # @@ -99,6 +127,7 @@ def __hash__(self): # METHOD_SPECS = ( hash_by_id, + sort_children_by_index, ) From f1d289274f061ce159375d65f142bd9e62e3876e Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 13 May 2020 12:35:46 +0200 Subject: [PATCH 3/6] Update tests/model/TEMP1_Gutachten2-2.xml Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- tests/model/TEMP1_Gutachten2-2.xml | 410 +++++++++++++++++++++-------- 1 file changed, 297 insertions(+), 113 deletions(-) diff --git a/tests/model/TEMP1_Gutachten2-2.xml b/tests/model/TEMP1_Gutachten2-2.xml index 869fc0358..8693a53e9 100644 --- a/tests/model/TEMP1_Gutachten2-2.xml +++ b/tests/model/TEMP1_Gutachten2-2.xml @@ -51,15 +51,41 @@ + + + + + + + + + + + + + + + + + - + - - + + + + + + + + + + + @@ -67,236 +93,394 @@ - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - - - - + - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - + - + - + + + + + + + - + - + - + + + + + + + + + + - + - - - - - - - + - + - + - + - + - - - - + - + - + + + + - - - + + + + + + - - - + + + + + + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + From 7b5a84157a4ea51dd0eb587fc7132fc7b3922d46 Mon Sep 
17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 13 May 2020 12:37:25 +0200 Subject: [PATCH 4/6] Update ocrd_models/ocrd_models/ocrd_page_generateds.py Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- .../ocrd_models/ocrd_page_generateds.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index bc59c6367..8c15a0820 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -6087,15 +6087,16 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - for RegionRefIndexed_ in sorted(self.RegionRefIndexed, key=lambda rri: rri.index): - namespaceprefix_ = self.RegionRefIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.RegionRefIndexed_nsprefix_) else '' - RegionRefIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='RegionRefIndexed', pretty_print=pretty_print) - for OrderedGroupIndexed_ in sorted(self.OrderedGroupIndexed, key=lambda ogi: ogi.index): - namespaceprefix_ = self.OrderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.OrderedGroupIndexed_nsprefix_) else '' - OrderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='OrderedGroupIndexed', pretty_print=pretty_print) - for UnorderedGroupIndexed_ in sorted(self.UnorderedGroupIndexed, key=lambda ugi: ugi.index): - namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroupIndexed_nsprefix_) else '' - UnorderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroupIndexed', 
pretty_print=pretty_print) + namespaceprefix_ = '' + if UseCapturedNS_: + if self.RegionRefIndexed_nsprefix_: + namespaceprefix_ = self.RegionRefIndexed_nsprefix_ + ':' + elif self.OrderedGroupIndexed_nsprefix_: + namespaceprefix_ = self.OrderedGroupIndexed_nsprefix_ + ':' + elif self.UnorderedGroupIndexed_nsprefix_: + namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' + for entry in sorted(self.RegionRefIndexed + self.OrderedGroupIndexed + self.UnorderedGroupIndexed, key=lambda rri: rri.index): + entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) # end class OrderedGroupType From bab13afa7d695123dcc8ff3b3536f1d47ddb6e37 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 13 May 2020 12:52:12 +0200 Subject: [PATCH 5/6] Update ocrd_models/ocrd_page_user_methods.py Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_models/ocrd_page_user_methods.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/ocrd_models/ocrd_page_user_methods.py b/ocrd_models/ocrd_page_user_methods.py index 6b9a637bc..e9c3ff827 100644 --- a/ocrd_models/ocrd_page_user_methods.py +++ b/ocrd_models/ocrd_page_user_methods.py @@ -99,15 +99,16 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - for RegionRefIndexed_ in sorted(self.RegionRefIndexed, key=lambda rri: rri.index): - namespaceprefix_ = self.RegionRefIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.RegionRefIndexed_nsprefix_) else '' - RegionRefIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='RegionRefIndexed', pretty_print=pretty_print) - for OrderedGroupIndexed_ in 
sorted(self.OrderedGroupIndexed, key=lambda ogi: ogi.index): - namespaceprefix_ = self.OrderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.OrderedGroupIndexed_nsprefix_) else '' - OrderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='OrderedGroupIndexed', pretty_print=pretty_print) - for UnorderedGroupIndexed_ in sorted(self.UnorderedGroupIndexed, key=lambda ugi: ugi.index): - namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroupIndexed_nsprefix_) else '' - UnorderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroupIndexed', pretty_print=pretty_print) + namespaceprefix_ = '' + if UseCapturedNS_: + if self.RegionRefIndexed_nsprefix_: + namespaceprefix_ = self.RegionRefIndexed_nsprefix_ + ':' + elif self.OrderedGroupIndexed_nsprefix_: + namespaceprefix_ = self.OrderedGroupIndexed_nsprefix_ + ':' + elif self.UnorderedGroupIndexed_nsprefix_: + namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' + for entry in sorted(self.RegionRefIndexed + self.OrderedGroupIndexed + self.UnorderedGroupIndexed, key=lambda rri: rri.index): +entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) ''', class_names=r'OrderedGroupType$', ) From 1bbf5fb4bda9cd1b3c98246acd72b300fa0e4ee6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 13 May 2020 16:09:14 +0200 Subject: [PATCH 6/6] Add additional methods to OrderedGroupType/OrderedGroupIndexedType to * `get_AllIndexed()`: list all RegionRefIndexed, OrderedGroupIndexed and UnorderedGroupIndexed elements, sorted ascending by their index * `clear_AllIndexed()`: Removes all RegionRefIndexed, OrderedGroupIndexed and UnorderedGroupIndexed elements and returns them * `add_AllIndexed(elements)`: Add a variety of RegionRefIndexed, OrderedGroupIndexed and UnorderedGroupIndexed elements with proper sort by index On export of 
an OrderedGroup, the elements are listed with `get_AllIndexed` and hence should have the correct order. Also when exporting: all empty OrderedGroupIndexed/UnorderedGroupIndexed will be replaced with `RegionRefIndexed` with appropriate `index` and `regionRef` --- ocrd_models/ocrd_models/ocrd_page.py | 7 +- .../ocrd_models/ocrd_page_generateds.py | 112 +++++++++++++++--- ocrd_models/ocrd_page_user_methods.py | 81 +++++++++---- tests/model/TEMP1_Gutachten2-2.xml | 4 +- tests/model/test_ocrd_page.py | 42 +++++-- 5 files changed, 197 insertions(+), 49 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page.py b/ocrd_models/ocrd_models/ocrd_page.py index d928bf060..55a2084f4 100644 --- a/ocrd_models/ocrd_models/ocrd_page.py +++ b/ocrd_models/ocrd_models/ocrd_page.py @@ -21,6 +21,8 @@ 'PageType', 'PcGtsType', 'ReadingOrderType', + 'OrderedGroupIdexedType' + 'UnorderedGroupIndexedType', 'RegionRefIndexedType', 'SeparatorRegionType', 'TextEquivType', @@ -44,11 +46,10 @@ LabelsType, MathsRegionType, MetadataType, - MetadataItemType, - NoiseRegionType, - OrderedGroupType, PageType, PcGtsType, + OrderedGroupIndexedType, + UnorderedGroupIndexedType, ReadingOrderType, RegionRefIndexedType, SeparatorRegionType, diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 8c15a0820..941851c35 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Tue May 12 20:21:26 2020 by generateDS.py version 2.35.20. +# Generated Wed May 13 16:09:07 2020 by generateDS.py version 2.35.20. 
# Python 3.7.6 (default, Jan 8 2020, 19:59:22) [GCC 7.3.0] # # Command line options: @@ -5347,6 +5347,47 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.original_tagname_ = 'UnorderedGroupIndexed' def __hash__(self): return hash(self.id) + + def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): + eol_ = '\n' if pretty_print else '' + namespaceprefix_ = 'pc:' + if self.UserDefined is not None: + self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) + for Labels_ in self.Labels: + Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) + cleaned = [] + # remove emtpy groups and replace with RegionRefIndexedType + for entry in self.get_AllIndexed(): + if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): + rri = RegionRefIndexedType.factory(parent_object_=self) + rri.index = entry.index + rri.regionRef = entry.regionRef + cleaned.append(rri) + else: + cleaned.append(entry) + for entry in cleaned: + entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) + + def get_AllIndexed(self): + return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) + def add_AllIndexed(self, elements): + if not isinstance(elements, list): + elements = [elements] + for element in sorted(elements, key=lambda x : x.index): + if isinstance(element, RegionRefIndexedType): + self.add_RegionRefIndexed(element) + elif isinstance(element, OrderedGroupIndexedType): + self.add_OrderedGroupIndexed(element) + elif isinstance(element, UnorderedGroupIndexedType): + 
self.add_UnorderedGroupIndexed(element) + return self.get_AllIndexed() + + def clear_AllIndexed(self): + ret = self.get_AllIndexed() + self.set_RegionRefIndexed([]) + self.set_OrderedGroupIndexed([]) + self.set_UnorderedGroupIndexed([]) + return ret # end class OrderedGroupIndexedType @@ -6001,6 +6042,26 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) + def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): + if pretty_print: + eol_ = '\n' + else: + eol_ = '' + if self.UserDefined is not None: + namespaceprefix_ = self.UserDefined_nsprefix_ + ':' if (UseCapturedNS_ and self.UserDefined_nsprefix_) else '' + self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) + for Labels_ in self.Labels: + namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' + Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) + for RegionRefIndexed_ in self.RegionRefIndexed: + namespaceprefix_ = self.RegionRefIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.RegionRefIndexed_nsprefix_) else '' + RegionRefIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='RegionRefIndexed', pretty_print=pretty_print) + for OrderedGroupIndexed_ in self.OrderedGroupIndexed: + namespaceprefix_ = self.OrderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.OrderedGroupIndexed_nsprefix_) else '' + OrderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', 
name_='OrderedGroupIndexed', pretty_print=pretty_print) + for UnorderedGroupIndexed_ in self.UnorderedGroupIndexed: + namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroupIndexed_nsprefix_) else '' + UnorderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroupIndexed', pretty_print=pretty_print) def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ if SaveElementTreeNode: @@ -6077,26 +6138,45 @@ def __hash__(self): return hash(self.id) def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): - if pretty_print: - eol_ = '\n' - else: - eol_ = '' + eol_ = '\n' if pretty_print else '' + namespaceprefix_ = 'pc:' if self.UserDefined is not None: - namespaceprefix_ = self.UserDefined_nsprefix_ + ':' if (UseCapturedNS_ and self.UserDefined_nsprefix_) else '' self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) for Labels_ in self.Labels: - namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - namespaceprefix_ = '' - if UseCapturedNS_: - if self.RegionRefIndexed_nsprefix_: - namespaceprefix_ = self.RegionRefIndexed_nsprefix_ + ':' - elif self.OrderedGroupIndexed_nsprefix_: - namespaceprefix_ = self.OrderedGroupIndexed_nsprefix_ + ':' - elif self.UnorderedGroupIndexed_nsprefix_: - namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' - for entry in sorted(self.RegionRefIndexed + self.OrderedGroupIndexed + self.UnorderedGroupIndexed, key=lambda rri: rri.index): + cleaned = [] + # remove emtpy groups and replace with RegionRefIndexedType + for entry in 
self.get_AllIndexed(): + if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): + rri = RegionRefIndexedType.factory(parent_object_=self) + rri.index = entry.index + rri.regionRef = entry.regionRef + cleaned.append(rri) + else: + cleaned.append(entry) + for entry in cleaned: entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) + + def get_AllIndexed(self): + return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) + def add_AllIndexed(self, elements): + if not isinstance(elements, list): + elements = [elements] + for element in sorted(elements, key=lambda x : x.index): + if isinstance(element, RegionRefIndexedType): + self.add_RegionRefIndexed(element) + elif isinstance(element, OrderedGroupIndexedType): + self.add_OrderedGroupIndexed(element) + elif isinstance(element, UnorderedGroupIndexedType): + self.add_UnorderedGroupIndexed(element) + return self.get_AllIndexed() + + def clear_AllIndexed(self): + ret = self.get_AllIndexed() + self.set_RegionRefIndexed([]) + self.set_OrderedGroupIndexed([]) + self.set_UnorderedGroupIndexed([]) + return ret # end class OrderedGroupType diff --git a/ocrd_models/ocrd_page_user_methods.py b/ocrd_models/ocrd_page_user_methods.py index e9c3ff827..927c2298b 100644 --- a/ocrd_models/ocrd_page_user_methods.py +++ b/ocrd_models/ocrd_page_user_methods.py @@ -83,35 +83,71 @@ def show(self): # # Replace the following method specifications with your own. 
+# +# List all *Indexed children sorted by @index +# +get_AllIndexed = MethodSpec(name='get_AllIndexed', + source=r''' + def get_AllIndexed(self): + return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) ''', class_names=r'^(OrderedGroupType|OrderedGroupIndexedType)$') + +# +# Clear all *Indexed children sorted by @index +# +clear_AllIndexed = MethodSpec(name='clear_AllIndexed', + source=r''' + def clear_AllIndexed(self): + ret = self.get_AllIndexed() + self.set_RegionRefIndexed([]) + self.set_OrderedGroupIndexed([]) + self.set_UnorderedGroupIndexed([]) + return ret +''', class_names=r'^(OrderedGroupType|OrderedGroupIndexedType)$') + +# +# Add all *Indexed children sorted by @index +# +add_AllIndexed = MethodSpec(name='add_AllIndexed', + source=r''' + def add_AllIndexed(self, elements): + if not isinstance(elements, list): + elements = [elements] + for element in sorted(elements, key=lambda x : x.index): + if isinstance(element, RegionRefIndexedType): + self.add_RegionRefIndexed(element) + elif isinstance(element, OrderedGroupIndexedType): + self.add_OrderedGroupIndexed(element) + elif isinstance(element, UnorderedGroupIndexedType): + self.add_UnorderedGroupIndexed(element) + return self.get_AllIndexed() +''', class_names=r'^(OrderedGroupType|OrderedGroupIndexedType)$') + + # # export children sorted by index of the childelement # -sort_children_by_index = MethodSpec(name='exportChildren', +exportChildren = MethodSpec(name='exportChildren', source=r''' def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): - if pretty_print: - eol_ = '\n' - else: - eol_ = '' + eol_ = '\n' if pretty_print else '' + namespaceprefix_ = 'pc:' if self.UserDefined is not None: - namespaceprefix_ = self.UserDefined_nsprefix_ + ':' if 
(UseCapturedNS_ and self.UserDefined_nsprefix_) else '' self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) for Labels_ in self.Labels: - namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - namespaceprefix_ = '' - if UseCapturedNS_: - if self.RegionRefIndexed_nsprefix_: - namespaceprefix_ = self.RegionRefIndexed_nsprefix_ + ':' - elif self.OrderedGroupIndexed_nsprefix_: - namespaceprefix_ = self.OrderedGroupIndexed_nsprefix_ + ':' - elif self.UnorderedGroupIndexed_nsprefix_: - namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' - for entry in sorted(self.RegionRefIndexed + self.OrderedGroupIndexed + self.UnorderedGroupIndexed, key=lambda rri: rri.index): -entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) -''', - class_names=r'OrderedGroupType$', - ) + cleaned = [] + # remove emtpy groups and replace with RegionRefIndexedType + for entry in self.get_AllIndexed(): + if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): + rri = RegionRefIndexedType.factory(parent_object_=self) + rri.index = entry.index + rri.regionRef = entry.regionRef + cleaned.append(rri) + else: + cleaned.append(entry) + for entry in cleaned: + entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) +''', class_names=r'^(OrderedGroupType|OrderedGroupIndexedType)$') # # Hash by memory adress/id() # @@ -128,7 +164,10 @@ def __hash__(self): # METHOD_SPECS = ( hash_by_id, - sort_children_by_index, + exportChildren, + get_AllIndexed, + add_AllIndexed, + clear_AllIndexed, ) diff --git a/tests/model/TEMP1_Gutachten2-2.xml b/tests/model/TEMP1_Gutachten2-2.xml index 
8693a53e9..f0d7cbd33 100644 --- a/tests/model/TEMP1_Gutachten2-2.xml +++ b/tests/model/TEMP1_Gutachten2-2.xml @@ -75,7 +75,7 @@ - + @@ -85,7 +85,7 @@ - + diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 342963094..978d2a47e 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -7,6 +7,8 @@ PageType, TextRegionType, TextLineType, + OrderedGroupIndexedType, + RegionRefIndexedType, WordType, GlyphType, @@ -49,6 +51,7 @@ class TestOcrdPage(TestCase): def setUp(self): + self.maxDiff = 5000 with open(assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml'), 'rb') as f: self.xml_as_str = f.read() self.pcgts = parseString(self.xml_as_str, silence=True) @@ -146,15 +149,40 @@ def test_orderedgroup_export_order(self): with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: pcgts = parseString(f.read().encode('utf8'), silence=True) og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup() - rris = og.get_RegionRefIndexed() - self.assertEqual([rri.index for rri in rris], list(range(0, 17))) - rris = list(reversed(rris)) - self.assertEqual([rri.index for rri in rris], list(reversed(range(0, 17)))) - og.set_RegionRefIndexed(rris) - # reverse sort the RegionRefIndexeds + xml_before = to_xml(og) + children = og.get_AllIndexed() + self.assertEqual(len(children), 20) + self.assertEqual([c.index for c in children], list(range(0, 20))) + # mix up the indexes + children[0].index = 11 + children[11].index = 3 + children[3].index = 0 + self.assertEqual([c.index for c in children], [11, 1, 2, 0, 4, 5, 6, 7, 8, 9, 10, 3, 12, 13, 14, 15, 16, 17, 18, 19]) + self.assertEqual([c.index for c in og.get_AllIndexed()], list(range(0, 20))) + self.assertEqual(og.get_AllIndexed()[1].__class__, OrderedGroupIndexedType) + # serialize and make sure the correct order was serialized + new_pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True) + new_og = new_pcgts.get_Page().get_ReadingOrder().get_OrderedGroup() + 
self.assertEqual([c.index for c in new_og.get_AllIndexed()], list(range(0, 20))) + # xml_after = to_xml(new_og) + # self.assertEqual(xml_after, xml_before) + + def test_empty_groups_to_regionrefindexed(self): + """ + Corrolary See https://github.com/OCR-D/core/issues/475 + """ + with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: + pcgts = parseString(f.read().encode('utf8'), silence=True) + og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup() + children = og.get_AllIndexed() + self.assertTrue(isinstance(children[1], OrderedGroupIndexedType)) + # empty all the elements in the first orederdGroupIndexed + children[1].set_RegionRefIndexed([]) + # serialize apnd parse to see empty group converted pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True) og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup() - self.assertEqual([rri.index for rri in rris], list(range(0, 17))) + children = og.get_AllIndexed() + self.assertTrue(isinstance(children[1], RegionRefIndexedType)) if __name__ == '__main__': main()