From abef3522d6f3f50993ee60414649724aa007121f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 13 May 2020 20:44:25 +0200 Subject: [PATCH 01/30] PageType.get_AllRegions to list all kinds of regions --- .../ocrd_models/ocrd_page_generateds.py | 18 ++++++++++++++-- ocrd_models/ocrd_page_user_methods.py | 21 +++++++++++++++++++ tests/model/test_ocrd_page.py | 11 ++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 941851c35b..0295dc8ef0 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Wed May 13 16:09:07 2020 by generateDS.py version 2.35.20. +# Generated Wed May 13 20:41:21 2020 by generateDS.py version 2.35.20. # Python 3.7.6 (default, Jan 8 2020, 19:59:22) [GCC 7.3.0] # # Command line options: @@ -2850,7 +2850,21 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.original_tagname_ = 'CustomRegion' def __hash__(self): return hash(self.id) -# end class PageType + + def get_AllRegions(self, regions=None, reading_order=False): + """" + Get all the *Region element or only those provided by ``regions``. 
+ Returned in random order unless ``reading_order`` is set (NOT CURRENTLY IMPLEMENTED) + """ + if reading_order: + raise NotImplementedError("Ordering of regions by Reading Order not currently Implemented :(") + if not regions: + regions = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Table', 'Text'] + ret = [] + for region in regions: + ret += getattr(self, 'get_{}Region'.format(region))() + return ret + # end class PageType class CoordsType(GeneratedsSuper): diff --git a/ocrd_models/ocrd_page_user_methods.py b/ocrd_models/ocrd_page_user_methods.py index 927c2298b1..8b87138692 100644 --- a/ocrd_models/ocrd_page_user_methods.py +++ b/ocrd_models/ocrd_page_user_methods.py @@ -83,6 +83,26 @@ def show(self): # # Replace the following method specifications with your own. +# +# List all *Regions on the PAGE +# +get_AllRegions = MethodSpec(name='get_AllRegions', + source=r''' + def get_AllRegions(self, regions=None, reading_order=False): + """" + Get all the *Region element or only those provided by ``regions``. 
+ Returned in random order unless ``reading_order`` is set (NOT CURRENTLY IMPLEMENTED) + """ + if reading_order: + raise NotImplementedError("Ordering of regions by Reading Order not currently Implemented :(") + if not regions: + regions = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Table', 'Text'] + ret = [] + for region in regions: + ret += getattr(self, 'get_{}Region'.format(region))() + return ret + ''', class_names=r'^(PageType)$') + # # List all *Indexed children sorted by @index # @@ -167,6 +187,7 @@ def __hash__(self): exportChildren, get_AllIndexed, add_AllIndexed, + get_AllRegions, clear_AllIndexed, ) diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 978d2a47e5..605ee4e483 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -184,5 +184,16 @@ def test_empty_groups_to_regionrefindexed(self): children = og.get_AllIndexed() self.assertTrue(isinstance(children[1], RegionRefIndexedType)) + def test_all_regions(self): + """ + Corrolary See https://github.com/OCR-D/core/issues/475 + """ + with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: + pcgts = parseString(f.read().encode('utf8'), silence=True) + pg = pcgts.get_Page() + self.assertEqual(len(pg.get_AllRegions()), 20) + self.assertEqual(len(pg.get_AllRegions(['Table'])), 3) + self.assertEqual(len(pg.get_AllRegions(['Text'])), 17) + if __name__ == '__main__': main() From 3445f87f027970d57227170115116cc440e624af Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 14 May 2020 14:12:44 +0200 Subject: [PATCH 02/30] Update ocrd_models/ocrd_page_user_methods.py Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_models/ocrd_page_user_methods.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/ocrd_models/ocrd_page_user_methods.py 
b/ocrd_models/ocrd_page_user_methods.py index 8b87138692..da77ad7f5e 100644 --- a/ocrd_models/ocrd_page_user_methods.py +++ b/ocrd_models/ocrd_page_user_methods.py @@ -94,13 +94,32 @@ def get_AllRegions(self, regions=None, reading_order=False): Returned in random order unless ``reading_order`` is set (NOT CURRENTLY IMPLEMENTED) """ if reading_order: - raise NotImplementedError("Ordering of regions by Reading Order not currently Implemented :(") + reading_order = self.get_ReadingOrder() if not regions: - regions = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Table', 'Text'] + regions = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown'] ret = [] for region in regions: ret += getattr(self, 'get_{}Region'.format(region))() - return ret + if reading_order: + reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() + if reading_order: + def get_recursive_reading_order(rogroup): + if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): + elements = rogroup.get_AllIndexed() + if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): + elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) + regionrefs = list() + for elem in elements: + regionrefs.append(elem.get_regionRef()) + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + regionrefs.extend(get_recursive_reading_order(elem)) + return regionrefs + reading_order = get_recursive_reading_order(reading_order) + if reading_order: + ret = dict([(region.id, region) for region in ret]) + return [ret[region_id] for region_id in reading_order if region_id in ret] + else: + return ret ''', class_names=r'^(PageType)$') # From a48b8c1a80653f106c48e57b260e6048e46235f7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 14 May 2020 15:40:10 +0200 Subject: 
[PATCH 03/30] update generateds page, add region filter if using reading order, wip --- .../ocrd_models/ocrd_page_generateds.py | 31 +++++++++++++++---- ocrd_models/ocrd_page_user_methods.py | 6 ++-- tests/model/test_ocrd_page.py | 8 +++-- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 0295dc8ef0..cd0e078bce 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # -# Generated Wed May 13 20:41:21 2020 by generateDS.py version 2.35.20. -# Python 3.7.6 (default, Jan 8 2020, 19:59:22) [GCC 7.3.0] +# Generated Thu May 14 15:35:20 2020 by generateDS.py version 2.35.20. +# Python 3.6.6 (default, Jul 24 2018, 16:39:20) [GCC 4.9.2] # # Command line options: # ('-f', '') @@ -16,7 +16,7 @@ # repo/assets/data/schema/data/2019.xsd # # Command line: -# /home/kba/miniconda3/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" repo/assets/data/schema/data/2019.xsd +# /data/monorepo/venv3.6/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" repo/assets/data/schema/data/2019.xsd # # Current working directory (os.getcwd()): # core @@ -2857,13 +2857,32 @@ def get_AllRegions(self, regions=None, reading_order=False): Returned in random order unless ``reading_order`` is set (NOT CURRENTLY IMPLEMENTED) """ if reading_order: - raise NotImplementedError("Ordering of regions by Reading Order not currently Implemented :(") + reading_order = self.get_ReadingOrder() if not regions: - regions = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Table', 'Text'] + 
regions = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown'] ret = [] for region in regions: ret += getattr(self, 'get_{}Region'.format(region))() - return ret + if reading_order: + reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() + if reading_order: + def get_recursive_reading_order(rogroup): + if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): + elements = rogroup.get_AllIndexed() + if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): + elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) + regionrefs = list() + for elem in elements: + regionrefs.append(elem.get_regionRef()) + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + regionrefs.extend(get_recursive_reading_order(elem)) + return regionrefs + reading_order = get_recursive_reading_order(reading_order) + if reading_order: + ret = dict([(region.id, region) for region in ret]) + return [ret[region_id] for region_id in reading_order if region_id in ret] + else: + return ret # end class PageType diff --git a/ocrd_models/ocrd_page_user_methods.py b/ocrd_models/ocrd_page_user_methods.py index da77ad7f5e..bf4261022e 100644 --- a/ocrd_models/ocrd_page_user_methods.py +++ b/ocrd_models/ocrd_page_user_methods.py @@ -117,9 +117,9 @@ def get_recursive_reading_order(rogroup): reading_order = get_recursive_reading_order(reading_order) if reading_order: ret = dict([(region.id, region) for region in ret]) - return [ret[region_id] for region_id in reading_order if region_id in ret] - else: - return ret + ret = [ret[region_id] for region_id in reading_order if region_id in ret] + ret = [r in ret if r.__class__.__name__.replace('RegionType', '') in regions + return ret ''', class_names=r'^(PageType)$') # diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 
605ee4e483..b24d1e45ff 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -184,14 +184,16 @@ def test_empty_groups_to_regionrefindexed(self): children = og.get_AllIndexed() self.assertTrue(isinstance(children[1], RegionRefIndexedType)) - def test_all_regions(self): + def test_all_regions_without_reading_order(self): """ - Corrolary See https://github.com/OCR-D/core/issues/475 + https://github.com/OCR-D/core/pull/479 + https://github.com/OCR-D/core/issues/240#issuecomment-493135797 """ with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: pcgts = parseString(f.read().encode('utf8'), silence=True) pg = pcgts.get_Page() - self.assertEqual(len(pg.get_AllRegions()), 20) + self.assertEqual(len(pg.get_AllRegions()), 45) + self.assertEqual(len(pg.get_AllRegions(['Separator'])), 25) self.assertEqual(len(pg.get_AllRegions(['Table'])), 3) self.assertEqual(len(pg.get_AllRegions(['Text'])), 17) From d2a01bb799b32d89d8d04b6f695e37b9e83cd8a6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 15 May 2020 16:17:48 +0200 Subject: [PATCH 04/30] refactoring: move generateDS methods to their own files --- .../ocrd_models/ocrd_page_generateds.py | 131 +++++++---------- ocrd_models/ocrd_page_user_methods.py | 139 +++--------------- .../ocrd_page_user_methods/__hash__.py | 2 + .../ocrd_page_user_methods/add_AllIndexed.py | 12 ++ .../clear_AllIndexed.py | 7 + .../ocrd_page_user_methods/exportChildren.py | 20 +++ .../ocrd_page_user_methods/get_AllIndexed.py | 3 + .../ocrd_page_user_methods/get_AllRegions.py | 32 ++++ 8 files changed, 143 insertions(+), 203 deletions(-) create mode 100644 ocrd_models/ocrd_page_user_methods/__hash__.py create mode 100644 ocrd_models/ocrd_page_user_methods/add_AllIndexed.py create mode 100644 ocrd_models/ocrd_page_user_methods/clear_AllIndexed.py create mode 100644 ocrd_models/ocrd_page_user_methods/exportChildren.py create mode 100644 ocrd_models/ocrd_page_user_methods/get_AllIndexed.py create mode 100644 
ocrd_models/ocrd_page_user_methods/get_AllRegions.py diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index cd0e078bce..8a3705a3e4 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # -# Generated Thu May 14 15:35:20 2020 by generateDS.py version 2.35.20. -# Python 3.6.6 (default, Jul 24 2018, 16:39:20) [GCC 4.9.2] +# Generated Fri May 15 16:15:27 2020 by generateDS.py version 2.35.20. +# Python 3.7.6 (default, Jan 8 2020, 19:59:22) [GCC 7.3.0] # # Command line options: # ('-f', '') @@ -16,7 +16,7 @@ # repo/assets/data/schema/data/2019.xsd # # Command line: -# /data/monorepo/venv3.6/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" repo/assets/data/schema/data/2019.xsd +# /home/kba/miniconda3/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" repo/assets/data/schema/data/2019.xsd # # Current working directory (os.getcwd()): # core @@ -2850,40 +2850,7 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.original_tagname_ = 'CustomRegion' def __hash__(self): return hash(self.id) - - def get_AllRegions(self, regions=None, reading_order=False): - """" - Get all the *Region element or only those provided by ``regions``. 
- Returned in random order unless ``reading_order`` is set (NOT CURRENTLY IMPLEMENTED) - """ - if reading_order: - reading_order = self.get_ReadingOrder() - if not regions: - regions = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown'] - ret = [] - for region in regions: - ret += getattr(self, 'get_{}Region'.format(region))() - if reading_order: - reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() - if reading_order: - def get_recursive_reading_order(rogroup): - if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): - elements = rogroup.get_AllIndexed() - if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): - elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) - regionrefs = list() - for elem in elements: - regionrefs.append(elem.get_regionRef()) - if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): - regionrefs.extend(get_recursive_reading_order(elem)) - return regionrefs - reading_order = get_recursive_reading_order(reading_order) - if reading_order: - ret = dict([(region.id, region) for region in ret]) - return [ret[region_id] for region_id in reading_order if region_id in ret] - else: - return ret - # end class PageType +# end class PageType class CoordsType(GeneratedsSuper): @@ -5380,7 +5347,28 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.original_tagname_ = 'UnorderedGroupIndexed' def __hash__(self): return hash(self.id) - + def get_AllIndexed(self): + return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) + + def clear_AllIndexed(self): + ret = self.get_AllIndexed() + self.set_RegionRefIndexed([]) + self.set_OrderedGroupIndexed([]) + self.set_UnorderedGroupIndexed([]) + return ret + + def add_AllIndexed(self, 
elements): + if not isinstance(elements, list): + elements = [elements] + for element in sorted(elements, key=lambda x : x.index): + if isinstance(element, RegionRefIndexedType): + self.add_RegionRefIndexed(element) + elif isinstance(element, OrderedGroupIndexedType): + self.add_OrderedGroupIndexed(element) + elif isinstance(element, UnorderedGroupIndexedType): + self.add_UnorderedGroupIndexed(element) + return self.get_AllIndexed() + def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): eol_ = '\n' if pretty_print else '' namespaceprefix_ = 'pc:' @@ -5400,27 +5388,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml cleaned.append(entry) for entry in cleaned: entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) - - def get_AllIndexed(self): - return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) - def add_AllIndexed(self, elements): - if not isinstance(elements, list): - elements = [elements] - for element in sorted(elements, key=lambda x : x.index): - if isinstance(element, RegionRefIndexedType): - self.add_RegionRefIndexed(element) - elif isinstance(element, OrderedGroupIndexedType): - self.add_OrderedGroupIndexed(element) - elif isinstance(element, UnorderedGroupIndexedType): - self.add_UnorderedGroupIndexed(element) - return self.get_AllIndexed() - - def clear_AllIndexed(self): - ret = self.get_AllIndexed() - self.set_RegionRefIndexed([]) - self.set_OrderedGroupIndexed([]) - self.set_UnorderedGroupIndexed([]) - return ret + # end class OrderedGroupIndexedType @@ -6169,7 +6137,28 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.original_tagname_ = 'UnorderedGroupIndexed' 
def __hash__(self): return hash(self.id) - + def get_AllIndexed(self): + return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) + + def clear_AllIndexed(self): + ret = self.get_AllIndexed() + self.set_RegionRefIndexed([]) + self.set_OrderedGroupIndexed([]) + self.set_UnorderedGroupIndexed([]) + return ret + + def add_AllIndexed(self, elements): + if not isinstance(elements, list): + elements = [elements] + for element in sorted(elements, key=lambda x : x.index): + if isinstance(element, RegionRefIndexedType): + self.add_RegionRefIndexed(element) + elif isinstance(element, OrderedGroupIndexedType): + self.add_OrderedGroupIndexed(element) + elif isinstance(element, UnorderedGroupIndexedType): + self.add_UnorderedGroupIndexed(element) + return self.get_AllIndexed() + def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): eol_ = '\n' if pretty_print else '' namespaceprefix_ = 'pc:' @@ -6189,27 +6178,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml cleaned.append(entry) for entry in cleaned: entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) - - def get_AllIndexed(self): - return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) - def add_AllIndexed(self, elements): - if not isinstance(elements, list): - elements = [elements] - for element in sorted(elements, key=lambda x : x.index): - if isinstance(element, RegionRefIndexedType): - self.add_RegionRefIndexed(element) - elif isinstance(element, OrderedGroupIndexedType): - self.add_OrderedGroupIndexed(element) - elif isinstance(element, UnorderedGroupIndexedType): - 
self.add_UnorderedGroupIndexed(element) - return self.get_AllIndexed() - - def clear_AllIndexed(self): - ret = self.get_AllIndexed() - self.set_RegionRefIndexed([]) - self.set_OrderedGroupIndexed([]) - self.set_UnorderedGroupIndexed([]) - return ret + # end class OrderedGroupType diff --git a/ocrd_models/ocrd_page_user_methods.py b/ocrd_models/ocrd_page_user_methods.py index bf4261022e..7f93323794 100644 --- a/ocrd_models/ocrd_page_user_methods.py +++ b/ocrd_models/ocrd_page_user_methods.py @@ -3,6 +3,8 @@ # source: https://bitbucket.org/dkuhlman/generateds/src/default/gends_user_methods.py import re +import codecs +from os.path import dirname, join # # You must include the following class definition at the top of @@ -80,134 +82,27 @@ def show(self): # generated superclass file and also section "User Methods" in # the documentation, as well as the examples below. -# -# Replace the following method specifications with your own. - -# -# List all *Regions on the PAGE -# -get_AllRegions = MethodSpec(name='get_AllRegions', - source=r''' - def get_AllRegions(self, regions=None, reading_order=False): - """" - Get all the *Region element or only those provided by ``regions``. 
- Returned in random order unless ``reading_order`` is set (NOT CURRENTLY IMPLEMENTED) - """ - if reading_order: - reading_order = self.get_ReadingOrder() - if not regions: - regions = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown'] - ret = [] - for region in regions: - ret += getattr(self, 'get_{}Region'.format(region))() - if reading_order: - reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() - if reading_order: - def get_recursive_reading_order(rogroup): - if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): - elements = rogroup.get_AllIndexed() - if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): - elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) - regionrefs = list() - for elem in elements: - regionrefs.append(elem.get_regionRef()) - if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): - regionrefs.extend(get_recursive_reading_order(elem)) - return regionrefs - reading_order = get_recursive_reading_order(reading_order) - if reading_order: - ret = dict([(region.id, region) for region in ret]) - ret = [ret[region_id] for region_id in reading_order if region_id in ret] - ret = [r in ret if r.__class__.__name__.replace('RegionType', '') in regions - return ret - ''', class_names=r'^(PageType)$') - -# -# List all *Indexed children sorted by @index -# -get_AllIndexed = MethodSpec(name='get_AllIndexed', - source=r''' - def get_AllIndexed(self): - return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) ''', class_names=r'^(OrderedGroupType|OrderedGroupIndexedType)$') - -# -# Clear all *Indexed children sorted by @index -# -clear_AllIndexed = MethodSpec(name='clear_AllIndexed', - source=r''' - def clear_AllIndexed(self): - ret = self.get_AllIndexed() - 
self.set_RegionRefIndexed([]) - self.set_OrderedGroupIndexed([]) - self.set_UnorderedGroupIndexed([]) - return ret -''', class_names=r'^(OrderedGroupType|OrderedGroupIndexedType)$') - -# -# Add all *Indexed children sorted by @index -# -add_AllIndexed = MethodSpec(name='add_AllIndexed', - source=r''' - def add_AllIndexed(self, elements): - if not isinstance(elements, list): - elements = [elements] - for element in sorted(elements, key=lambda x : x.index): - if isinstance(element, RegionRefIndexedType): - self.add_RegionRefIndexed(element) - elif isinstance(element, OrderedGroupIndexedType): - self.add_OrderedGroupIndexed(element) - elif isinstance(element, UnorderedGroupIndexedType): - self.add_UnorderedGroupIndexed(element) - return self.get_AllIndexed() -''', class_names=r'^(OrderedGroupType|OrderedGroupIndexedType)$') - +def _add_method(class_re, method_name): + """ + Loads a file ./ocrd_page_user_methods/{{ method_name }}.py and defines a MethodSpec applying to class_re + """ + source = [] + with codecs.open(join(dirname(__file__), 'ocrd_page_user_methods', '%s.py' % method_name)) as f: + for line in f.readlines(): + source.append(' %s' % line if line else line) + return MethodSpec(name=method_name, class_names=class_re, source=''.join(source)) -# -# export children sorted by index of the childelement -# -exportChildren = MethodSpec(name='exportChildren', - source=r''' - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): - eol_ = '\n' if pretty_print else '' - namespaceprefix_ = 'pc:' - if self.UserDefined is not None: - self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) - for Labels_ in self.Labels: - Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - cleaned = 
[] - # remove emtpy groups and replace with RegionRefIndexedType - for entry in self.get_AllIndexed(): - if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): - rri = RegionRefIndexedType.factory(parent_object_=self) - rri.index = entry.index - rri.regionRef = entry.regionRef - cleaned.append(rri) - else: - cleaned.append(entry) - for entry in cleaned: - entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) -''', class_names=r'^(OrderedGroupType|OrderedGroupIndexedType)$') -# -# Hash by memory adress/id() -# -hash_by_id = MethodSpec(name='hash', - source='''\ - def __hash__(self): - return hash(self.id) -''', - class_names=r'^.*$', - ) # # Provide a list of your method specifications. # This list of specifications must be named METHOD_SPECS. # METHOD_SPECS = ( - hash_by_id, - exportChildren, - get_AllIndexed, - add_AllIndexed, - get_AllRegions, - clear_AllIndexed, + _add_method(r'^.*$', '__hash__'), + _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'get_AllIndexed'), + _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'clear_AllIndexed'), + _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'add_AllIndexed'), + _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'exportChildren'), + _add_method(r'^(PageType)$', 'get_AllRegions'), ) diff --git a/ocrd_models/ocrd_page_user_methods/__hash__.py b/ocrd_models/ocrd_page_user_methods/__hash__.py new file mode 100644 index 0000000000..dc34db5932 --- /dev/null +++ b/ocrd_models/ocrd_page_user_methods/__hash__.py @@ -0,0 +1,2 @@ +def __hash__(self): + return hash(self.id) diff --git a/ocrd_models/ocrd_page_user_methods/add_AllIndexed.py b/ocrd_models/ocrd_page_user_methods/add_AllIndexed.py new file mode 100644 index 0000000000..36e90ea9f6 --- /dev/null +++ b/ocrd_models/ocrd_page_user_methods/add_AllIndexed.py @@ -0,0 +1,12 @@ +def add_AllIndexed(self, 
elements): + if not isinstance(elements, list): + elements = [elements] + for element in sorted(elements, key=lambda x : x.index): + if isinstance(element, RegionRefIndexedType): + self.add_RegionRefIndexed(element) + elif isinstance(element, OrderedGroupIndexedType): + self.add_OrderedGroupIndexed(element) + elif isinstance(element, UnorderedGroupIndexedType): + self.add_UnorderedGroupIndexed(element) + return self.get_AllIndexed() + diff --git a/ocrd_models/ocrd_page_user_methods/clear_AllIndexed.py b/ocrd_models/ocrd_page_user_methods/clear_AllIndexed.py new file mode 100644 index 0000000000..a213e53852 --- /dev/null +++ b/ocrd_models/ocrd_page_user_methods/clear_AllIndexed.py @@ -0,0 +1,7 @@ +def clear_AllIndexed(self): + ret = self.get_AllIndexed() + self.set_RegionRefIndexed([]) + self.set_OrderedGroupIndexed([]) + self.set_UnorderedGroupIndexed([]) + return ret + diff --git a/ocrd_models/ocrd_page_user_methods/exportChildren.py b/ocrd_models/ocrd_page_user_methods/exportChildren.py new file mode 100644 index 0000000000..9f53f66dde --- /dev/null +++ b/ocrd_models/ocrd_page_user_methods/exportChildren.py @@ -0,0 +1,20 @@ +def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): + eol_ = '\n' if pretty_print else '' + namespaceprefix_ = 'pc:' + if self.UserDefined is not None: + self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) + for Labels_ in self.Labels: + Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) + cleaned = [] + # remove emtpy groups and replace with RegionRefIndexedType + for entry in self.get_AllIndexed(): + if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): + rri = 
RegionRefIndexedType.factory(parent_object_=self) + rri.index = entry.index + rri.regionRef = entry.regionRef + cleaned.append(rri) + else: + cleaned.append(entry) + for entry in cleaned: + entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) + diff --git a/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py b/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py new file mode 100644 index 0000000000..4d79632353 --- /dev/null +++ b/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py @@ -0,0 +1,3 @@ +def get_AllIndexed(self): + return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) + diff --git a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py new file mode 100644 index 0000000000..5fb645e350 --- /dev/null +++ b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py @@ -0,0 +1,32 @@ +def get_AllRegions(self, regions=None, reading_order=False): + """" + Get all the *Region element or only those provided by ``regions``. 
+ Returned in random order unless ``reading_order`` is set (NOT CURRENTLY IMPLEMENTED) + """ + if reading_order: + reading_order = self.get_ReadingOrder() + if not regions: + regions = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown'] + ret = [] + for region in regions: + ret += getattr(self, 'get_{}Region'.format(region))() + if reading_order: + reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() + if reading_order: + def get_recursive_reading_order(rogroup): + if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): + elements = rogroup.get_AllIndexed() + if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): + elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) + regionrefs = list() + for elem in elements: + regionrefs.append(elem.get_regionRef()) + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + regionrefs.extend(get_recursive_reading_order(elem)) + return regionrefs + reading_order = get_recursive_reading_order(reading_order) + if reading_order: + ret = dict([(region.id, region) for region in ret]) + ret = [ret[region_id] for region_id in reading_order if region_id in ret] + ret = [r in ret if r.__class__.__name__.replace('RegionType', '') in regions + return ret From be7f026a8c1e8e213e53b1e3f562596fc876fa06 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 15 May 2020 16:50:55 +0200 Subject: [PATCH 05/30] get_AllRegions: adapt to signature proposed in #240, test with order=reading-order --- .../ocrd_models/ocrd_page_generateds.py | 39 +++++++++++- .../ocrd_page_user_methods/get_AllRegions.py | 59 ++++++++++--------- tests/model/test_ocrd_page.py | 21 ++++++- 3 files changed, 88 insertions(+), 31 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 
8a3705a3e4..f6152110b1 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Fri May 15 16:15:27 2020 by generateDS.py version 2.35.20. +# Generated Fri May 15 16:48:56 2020 by generateDS.py version 2.35.20. # Python 3.7.6 (default, Jan 8 2020, 19:59:22) [GCC 7.3.0] # # Command line options: @@ -2850,6 +2850,43 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.original_tagname_ = 'CustomRegion' def __hash__(self): return hash(self.id) + def get_AllRegions(self, classes=None, order='document'): + """ + Get all the *Region element or only those provided by ``classes``. + Returned in random order unless ``reading_order`` is set (NOT CURRENTLY IMPLEMENTED) + Arguments: + classes (list) Classes of regions that shall be returned, e.g. ['Text', 'Image'] + order ("document"|"reading-order") Whether to return regions sorted by document order (default) or by reading order + """ + if order not in ['document', 'reading-order']: + raise Exception("Argument 'order' must be either 'document' or 'reading-order', not '{}'".format(order)) + if not classes: + classes = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown'] + def region_class(x): + return x.__class__.__name__.replace('RegionType', '') + ret = [] + for region in classes: + ret += getattr(self, 'get_{}Region'.format(region))() + if order == 'reading-order': + reading_order = self.get_ReadingOrder().get_OrderedGroup() or reading_order.get_UnorderedGroup() + if reading_order: + def get_recursive_reading_order(rogroup): + if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): + elements = rogroup.get_AllIndexed() + if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): + elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + 
rogroup.get_UnorderedGroup()) + regionrefs = list() + for elem in elements: + regionrefs.append(elem.get_regionRef()) + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + regionrefs.extend(get_recursive_reading_order(elem)) + return regionrefs + reading_order = get_recursive_reading_order(reading_order) + if reading_order: + id2region = dict([(region.id, region) for region in ret]) + ret = [id2region[region_id] for region_id in reading_order if region_id in id2region] + ret = [r for r in ret if region_class(r) in classes] + return ret # end class PageType diff --git a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py index 5fb645e350..d8f8cf8c2d 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py @@ -1,32 +1,37 @@ -def get_AllRegions(self, regions=None, reading_order=False): - """" - Get all the *Region element or only those provided by ``regions``. +def get_AllRegions(self, classes=None, order='document'): + """ + Get all the *Region element or only those provided by ``classes``. Returned in random order unless ``reading_order`` is set (NOT CURRENTLY IMPLEMENTED) + Arguments: + classes (list) Classes of regions that shall be returned, e.g. 
['Text', 'Image'] + order ("document"|"reading-order") Whether to return regions sorted by document order (default) or by reading order """ - if reading_order: - reading_order = self.get_ReadingOrder() - if not regions: - regions = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown'] + if order not in ['document', 'reading-order']: + raise Exception("Argument 'order' must be either 'document' or 'reading-order', not '{}'".format(order)) + if not classes: + classes = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown'] + def region_class(x): + return x.__class__.__name__.replace('RegionType', '') ret = [] - for region in regions: + for region in classes: ret += getattr(self, 'get_{}Region'.format(region))() - if reading_order: - reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() - if reading_order: - def get_recursive_reading_order(rogroup): - if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): - elements = rogroup.get_AllIndexed() - if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): - elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) - regionrefs = list() - for elem in elements: - regionrefs.append(elem.get_regionRef()) - if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): - regionrefs.extend(get_recursive_reading_order(elem)) - return regionrefs - reading_order = get_recursive_reading_order(reading_order) - if reading_order: - ret = dict([(region.id, region) for region in ret]) - ret = [ret[region_id] for region_id in reading_order if region_id in ret] - ret = [r in ret if r.__class__.__name__.replace('RegionType', '') in regions + if order == 'reading-order': + reading_order = self.get_ReadingOrder().get_OrderedGroup() or 
reading_order.get_UnorderedGroup() + if reading_order: + def get_recursive_reading_order(rogroup): + if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): + elements = rogroup.get_AllIndexed() + if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): + elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) + regionrefs = list() + for elem in elements: + regionrefs.append(elem.get_regionRef()) + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + regionrefs.extend(get_recursive_reading_order(elem)) + return regionrefs + reading_order = get_recursive_reading_order(reading_order) + if reading_order: + id2region = dict([(region.id, region) for region in ret]) + ret = [id2region[region_id] for region_id in reading_order if region_id in id2region] + ret = [r for r in ret if region_class(r) in classes] return ret diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index b24d1e45ff..7ee58e421f 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -193,9 +193,24 @@ def test_all_regions_without_reading_order(self): pcgts = parseString(f.read().encode('utf8'), silence=True) pg = pcgts.get_Page() self.assertEqual(len(pg.get_AllRegions()), 45) - self.assertEqual(len(pg.get_AllRegions(['Separator'])), 25) - self.assertEqual(len(pg.get_AllRegions(['Table'])), 3) - self.assertEqual(len(pg.get_AllRegions(['Text'])), 17) + self.assertEqual(len(pg.get_AllRegions(classes=['Separator'])), 25) + self.assertEqual(len(pg.get_AllRegions(classes=['Table'])), 3) + self.assertEqual(len(pg.get_AllRegions(classes=['Text'])), 17) + + def test_all_regions_with_reading_order(self): + """ + https://github.com/OCR-D/core/pull/479 + https://github.com/OCR-D/core/issues/240#issuecomment-493135797 + """ + with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: + pcgts = parseString(f.read().encode('utf8'), silence=True) + pg = pcgts.get_Page() + with 
self.assertRaisesRegex(Exception, "Argument 'order' must be either 'document' or 'reading-order', not 'random'"): + pg.get_AllRegions(order='random') + self.assertEqual(len(pg.get_AllRegions(order='reading-order')), 20) + self.assertEqual(len(pg.get_AllRegions(classes=['Table'], order='reading-order')), 3) + self.assertEqual(len(pg.get_AllRegions(classes=['Table'], order='reading-order')), 3) + self.assertEqual(len(pg.get_AllRegions(classes=['Text'], order='reading-order')), 17) if __name__ == '__main__': main() From e1740f78ac80d5ce48de6f8d0d8905fd737feb44 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 15 May 2020 16:58:19 +0200 Subject: [PATCH 06/30] README: explain how to add user methods to PAGE API --- ocrd_models/README.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/ocrd_models/README.md b/ocrd_models/README.md index 9564566c9f..c98d69b9b8 100644 --- a/ocrd_models/README.md +++ b/ocrd_models/README.md @@ -3,3 +3,30 @@ > OCR-D framework - file format APIs and schemas See https://github.com/OCR-D/core + +## Adding user methods to the generated PAGE API + +Let's say you want to add a method `get_FirstTextRegion` on the pc:Page element: + +1. Create a file `ocrd_models/ocrd_page_user_methods/get_FirstTextRegion.py` + +```python +def getFirstTextRegion(self) + return self.get_TextRegion[0] +``` + +2. Edit `ocrd_models/ocrd_page_user_methods.py` and append to the `METHOD_SPECS` list: + +```python +METHOD_SPECS = ( + # ... + _add_method(r'^PageType$', 'get_TextRegion') + # ... +) +``` + +3. 
Regenerate the PAGE API: + +```sh +make generate-page +``` From 6f9163ee8e64c70c1f937994af9f8e611beb8c71 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 May 2020 13:39:39 +0200 Subject: [PATCH 07/30] Update ocrd_models/README.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_models/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_models/README.md b/ocrd_models/README.md index c98d69b9b8..47901e654c 100644 --- a/ocrd_models/README.md +++ b/ocrd_models/README.md @@ -20,7 +20,7 @@ def getFirstTextRegion(self) ```python METHOD_SPECS = ( # ... - _add_method(r'^PageType$', 'get_TextRegion') + _add_method(r'^PageType$', 'get_FirstTextRegion') # ... ) ``` From 0c73b3e7f56639c0d7e545dbb890117e2e67f76f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 May 2020 13:40:14 +0200 Subject: [PATCH 08/30] Update ocrd_models/README.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_models/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_models/README.md b/ocrd_models/README.md index 47901e654c..44b6e95a65 100644 --- a/ocrd_models/README.md +++ b/ocrd_models/README.md @@ -11,7 +11,7 @@ Let's say you want to add a method `get_FirstTextRegion` on the pc:Page element: 1. 
Create a file `ocrd_models/ocrd_page_user_methods/get_FirstTextRegion.py` ```python -def getFirstTextRegion(self) +def get_FirstTextRegion(self): return self.get_TextRegion[0] ``` From 5c2f3a8028e54322fdccfe7a9578319a8582b785 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 May 2020 13:40:59 +0200 Subject: [PATCH 09/30] Update ocrd_models/README.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_models/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_models/README.md b/ocrd_models/README.md index 44b6e95a65..7e6ec6216b 100644 --- a/ocrd_models/README.md +++ b/ocrd_models/README.md @@ -15,6 +15,7 @@ def get_FirstTextRegion(self): return self.get_TextRegion[0] ``` + (Note that the method name and file name must be identical.) 2. Edit `ocrd_models/ocrd_page_user_methods.py` and append to the `METHOD_SPECS` list: ```python From 6a575061b383edb213e580beb37d82349b2c4e99 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 May 2020 13:51:45 +0200 Subject: [PATCH 10/30] recursion (with both finite or arbitrary depth) for get_AllRegions Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- .../ocrd_page_user_methods/get_AllRegions.py | 31 ++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py index d8f8cf8c2d..bda88180e2 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py @@ -8,15 +8,33 @@ def get_AllRegions(self, classes=None, order='document'): """ if order not in ['document', 'reading-order']: raise Exception("Argument 'order' must be either 'document' or 'reading-order', not '{}'".format(order)) - if not classes: - classes = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 
'Unknown'] def region_class(x): return x.__class__.__name__.replace('RegionType', '') - ret = [] - for region in classes: - ret += getattr(self, 'get_{}Region'.format(region))() + def get_recursive_regions(regions, level): + if level == 1: + # stop recursion, filter classes + if classes: + return [r for r in regions if region_class(r) in classes] + else: + return regions + # find more regions recursively + more_regions = [] + for region in regions: + more_regions.append([]) + for class_ in ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown']: + if class_ == 'Map' and not isinstance(region, PageType): + # 'Map' is not recursive in 2019 schema + continue + more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))() + if not any(more_regions): + return get_recursive_regions(regions, 1) + regions = [region for r, more in zip(regions, more_regions) for region in [r] + more] + return get_recursive_regions(regions, level - 1 if level else 0) + ret = get_recursive_regions([self], depth + 1 if depth else 0) if order == 'reading-order': - reading_order = self.get_ReadingOrder().get_OrderedGroup() or reading_order.get_UnorderedGroup() + reading_order = self.get_ReadingOrder() + if reading_order: + reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() if reading_order: def get_recursive_reading_order(rogroup): if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): @@ -33,5 +51,4 @@ def get_recursive_reading_order(rogroup): if reading_order: id2region = dict([(region.id, region) for region in ret]) ret = [id2region[region_id] for region_id in reading_order if region_id in id2region] - ret = [r for r in ret if region_class(r) in classes] return ret From a9072c8a92dfee62f72a1c04afd3413839e128e9 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 28 May 2020 14:26:15 +0200 Subject: [PATCH 11/30] regenerate PAGE API --- 
.../ocrd_models/ocrd_page_generateds.py | 39 +++++++++++++------ .../ocrd_page_user_methods/get_AllRegions.py | 2 +- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index f6152110b1..0d6d27940c 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # -# Generated Fri May 15 16:48:56 2020 by generateDS.py version 2.35.20. -# Python 3.7.6 (default, Jan 8 2020, 19:59:22) [GCC 7.3.0] +# Generated Thu May 28 14:25:40 2020 by generateDS.py version 2.35.20. +# Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: # ('-f', '') @@ -16,7 +16,7 @@ # repo/assets/data/schema/data/2019.xsd # # Command line: -# /home/kba/miniconda3/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" repo/assets/data/schema/data/2019.xsd +# /home/kba/ocrd_all/venv/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" repo/assets/data/schema/data/2019.xsd # # Current working directory (os.getcwd()): # core @@ -2853,22 +2853,40 @@ def __hash__(self): def get_AllRegions(self, classes=None, order='document'): """ Get all the *Region element or only those provided by ``classes``. - Returned in random order unless ``reading_order`` is set (NOT CURRENTLY IMPLEMENTED) + Returned in document order unless ``order`` is ``reading-order`` is set (NOT CURRENTLY IMPLEMENTED) Arguments: classes (list) Classes of regions that shall be returned, e.g. 
['Text', 'Image'] order ("document"|"reading-order") Whether to return regions sorted by document order (default) or by reading order """ if order not in ['document', 'reading-order']: raise Exception("Argument 'order' must be either 'document' or 'reading-order', not '{}'".format(order)) - if not classes: - classes = ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown'] def region_class(x): return x.__class__.__name__.replace('RegionType', '') - ret = [] - for region in classes: - ret += getattr(self, 'get_{}Region'.format(region))() + def get_recursive_regions(regions, level): + if level == 1: + # stop recursion, filter classes + if classes: + return [r for r in regions if region_class(r) in classes] + else: + return regions + # find more regions recursively + more_regions = [] + for region in regions: + more_regions.append([]) + for class_ in ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown']: + if class_ == 'Map' and not isinstance(region, PageType): + # 'Map' is not recursive in 2019 schema + continue + more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))() + if not any(more_regions): + return get_recursive_regions(regions, 1) + regions = [region for r, more in zip(regions, more_regions) for region in [r] + more] + return get_recursive_regions(regions, level - 1 if level else 0) + ret = get_recursive_regions([self], depth + 1 if depth else 0) if order == 'reading-order': - reading_order = self.get_ReadingOrder().get_OrderedGroup() or reading_order.get_UnorderedGroup() + reading_order = self.get_ReadingOrder() + if reading_order: + reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() if reading_order: def get_recursive_reading_order(rogroup): if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): @@ -2885,7 
+2903,6 @@ def get_recursive_reading_order(rogroup): if reading_order: id2region = dict([(region.id, region) for region in ret]) ret = [id2region[region_id] for region_id in reading_order if region_id in id2region] - ret = [r for r in ret if region_class(r) in classes] return ret # end class PageType diff --git a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py index bda88180e2..62dbe74e35 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py @@ -1,7 +1,7 @@ def get_AllRegions(self, classes=None, order='document'): """ Get all the *Region element or only those provided by ``classes``. - Returned in random order unless ``reading_order`` is set (NOT CURRENTLY IMPLEMENTED) + Returned in document order unless ``order`` is ``reading-order`` is set (NOT CURRENTLY IMPLEMENTED) Arguments: classes (list) Classes of regions that shall be returned, e.g. ['Text', 'Image'] order ("document"|"reading-order") Whether to return regions sorted by document order (default) or by reading order From ac62b857476967b44f7d81115424ca9c51fdbf93 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 28 May 2020 15:59:05 +0200 Subject: [PATCH 12/30] get_AllRegions: clean-up merge artifacts and reorganize --- .../ocrd_models/ocrd_page_generateds.py | 88 +++++++++++-------- .../ocrd_page_user_methods/get_AllRegions.py | 86 ++++++++++-------- 2 files changed, 97 insertions(+), 77 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 0d6d27940c..aac3997445 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Thu May 28 14:25:40 2020 by generateDS.py version 2.35.20. +# Generated Thu May 28 15:57:42 2020 by generateDS.py version 2.35.20. 
# Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -2850,58 +2850,68 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.original_tagname_ = 'CustomRegion' def __hash__(self): return hash(self.id) - def get_AllRegions(self, classes=None, order='document'): + # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring + def _region_class(self, x): # pylint: disable=unused-argument + return x.__class__.__name__.replace('RegionType', '') + + def _region_id(self, x): # pylint: disable=unused-argument + return x.id if hasattr(x, 'id') else x.pcGtsId + + def _get_recursive_regions(self, regions, level, classes=None): + if level == 1: + # stop recursion, filter classes + if classes: + return [r for r in regions if self._region_class(r) in classes] + # remove the first element (PageType) + return regions[1:] + # find more regions recursively + more_regions = [] + for region in regions: + more_regions.append([]) + for class_ in ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', + 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', + 'Separator', 'Table', 'Text', 'Unknown']: + if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable + # 'Map' is not recursive in 2019 schema + continue + more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))() + if not any(more_regions): + return _get_recursive_regions(regions, 1, classes) + regions = [region for r, more in zip(regions, more_regions) for region in [r] + more] + return self._get_recursive_regions(regions, level - 1 if level else 0, classes) + + def _get_recursive_reading_order(self, rogroup): + if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable + elements = rogroup.get_AllIndexed() + if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable + elements = 
(rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) + regionrefs = list() + for elem in elements: + regionrefs.append(elem.get_regionRef()) + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable + regionrefs.extend(self._get_recursive_reading_order(elem)) + return regionrefs + + def get_AllRegions(self, classes=None, order='document', depth=1): """ Get all the *Region element or only those provided by ``classes``. Returned in document order unless ``order`` is ``reading-order`` is set (NOT CURRENTLY IMPLEMENTED) Arguments: - classes (list) Classes of regions that shall be returned, e.g. ['Text', 'Image'] + classes (list) Classes of regions that shall be returned, e.g. ``['Text', 'Image']`` order ("document"|"reading-order") Whether to return regions sorted by document order (default) or by reading order + depth (int) Recursive depth to look for regions at. Default: 1 """ if order not in ['document', 'reading-order']: raise Exception("Argument 'order' must be either 'document' or 'reading-order', not '{}'".format(order)) - def region_class(x): - return x.__class__.__name__.replace('RegionType', '') - def get_recursive_regions(regions, level): - if level == 1: - # stop recursion, filter classes - if classes: - return [r for r in regions if region_class(r) in classes] - else: - return regions - # find more regions recursively - more_regions = [] - for region in regions: - more_regions.append([]) - for class_ in ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown']: - if class_ == 'Map' and not isinstance(region, PageType): - # 'Map' is not recursive in 2019 schema - continue - more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))() - if not any(more_regions): - return get_recursive_regions(regions, 1) - regions = [region for r, more in zip(regions, more_regions) for region in [r] + 
more] - return get_recursive_regions(regions, level - 1 if level else 0) - ret = get_recursive_regions([self], depth + 1 if depth else 0) + ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes) if order == 'reading-order': reading_order = self.get_ReadingOrder() if reading_order: reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() if reading_order: - def get_recursive_reading_order(rogroup): - if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): - elements = rogroup.get_AllIndexed() - if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): - elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) - regionrefs = list() - for elem in elements: - regionrefs.append(elem.get_regionRef()) - if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): - regionrefs.extend(get_recursive_reading_order(elem)) - return regionrefs - reading_order = get_recursive_reading_order(reading_order) + reading_order = self._get_recursive_reading_order(reading_order) if reading_order: - id2region = dict([(region.id, region) for region in ret]) + id2region = dict([(self._region_id(region), region) for region in ret]) # pylint: disable=consider-using-dict-comprehension ret = [id2region[region_id] for region_id in reading_order if region_id in id2region] return ret # end class PageType diff --git a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py index 62dbe74e35..6f3b2342a9 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py @@ -1,54 +1,64 @@ -def get_AllRegions(self, classes=None, order='document'): +# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring +def _region_class(self, x): # pylint: disable=unused-argument + return x.__class__.__name__.replace('RegionType', '') + +def _region_id(self, 
x): # pylint: disable=unused-argument + return x.id if hasattr(x, 'id') else x.pcGtsId + +def _get_recursive_regions(self, regions, level, classes=None): + if level == 1: + # stop recursion, filter classes + if classes: + return [r for r in regions if self._region_class(r) in classes] + # remove the first element (PageType) + return regions[1:] + # find more regions recursively + more_regions = [] + for region in regions: + more_regions.append([]) + for class_ in ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', + 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', + 'Separator', 'Table', 'Text', 'Unknown']: + if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable + # 'Map' is not recursive in 2019 schema + continue + more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))() + if not any(more_regions): + return _get_recursive_regions(regions, 1, classes) + regions = [region for r, more in zip(regions, more_regions) for region in [r] + more] + return self._get_recursive_regions(regions, level - 1 if level else 0, classes) + +def _get_recursive_reading_order(self, rogroup): + if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable + elements = rogroup.get_AllIndexed() + if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable + elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) + regionrefs = list() + for elem in elements: + regionrefs.append(elem.get_regionRef()) + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable + regionrefs.extend(self._get_recursive_reading_order(elem)) + return regionrefs + +def get_AllRegions(self, classes=None, order='document', depth=1): """ Get all the *Region element or only those provided by ``classes``. 
Returned in document order unless ``order`` is ``reading-order`` is set (NOT CURRENTLY IMPLEMENTED) Arguments: - classes (list) Classes of regions that shall be returned, e.g. ['Text', 'Image'] + classes (list) Classes of regions that shall be returned, e.g. ``['Text', 'Image']`` order ("document"|"reading-order") Whether to return regions sorted by document order (default) or by reading order + depth (int) Recursive depth to look for regions at. Default: 1 """ if order not in ['document', 'reading-order']: raise Exception("Argument 'order' must be either 'document' or 'reading-order', not '{}'".format(order)) - def region_class(x): - return x.__class__.__name__.replace('RegionType', '') - def get_recursive_regions(regions, level): - if level == 1: - # stop recursion, filter classes - if classes: - return [r for r in regions if region_class(r) in classes] - else: - return regions - # find more regions recursively - more_regions = [] - for region in regions: - more_regions.append([]) - for class_ in ['Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image', 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown']: - if class_ == 'Map' and not isinstance(region, PageType): - # 'Map' is not recursive in 2019 schema - continue - more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))() - if not any(more_regions): - return get_recursive_regions(regions, 1) - regions = [region for r, more in zip(regions, more_regions) for region in [r] + more] - return get_recursive_regions(regions, level - 1 if level else 0) - ret = get_recursive_regions([self], depth + 1 if depth else 0) + ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes) if order == 'reading-order': reading_order = self.get_ReadingOrder() if reading_order: reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() if reading_order: - def get_recursive_reading_order(rogroup): - if isinstance(rogroup, (OrderedGroupType, 
OrderedGroupIndexedType)): - elements = rogroup.get_AllIndexed() - if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): - elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) - regionrefs = list() - for elem in elements: - regionrefs.append(elem.get_regionRef()) - if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): - regionrefs.extend(get_recursive_reading_order(elem)) - return regionrefs - reading_order = get_recursive_reading_order(reading_order) + reading_order = self._get_recursive_reading_order(reading_order) if reading_order: - id2region = dict([(region.id, region) for region in ret]) + id2region = dict([(self._region_id(region), region) for region in ret]) # pylint: disable=consider-using-dict-comprehension ret = [id2region[region_id] for region_id in reading_order if region_id in id2region] return ret From fd6d54541248ca605f849abf72395eb657658c59 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 May 2020 16:09:05 +0200 Subject: [PATCH 13/30] Update ocrd_models/ocrd_page_user_methods/get_AllRegions.py Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_models/ocrd_page_user_methods/get_AllRegions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py index 6f3b2342a9..b7eb4e68bb 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py @@ -43,7 +43,7 @@ def _get_recursive_reading_order(self, rogroup): def get_AllRegions(self, classes=None, order='document', depth=1): """ Get all the *Region element or only those provided by ``classes``. 
- Returned in document order unless ``order`` is ``reading-order`` is set (NOT CURRENTLY IMPLEMENTED) + Returned in document order unless ``order`` is ``reading-order`` Arguments: classes (list) Classes of regions that shall be returned, e.g. ``['Text', 'Image']`` order ("document"|"reading-order") Whether to return regions sorted by document order (default) or by reading order From 86a7133579a11dfd3dac100d1ce38ca659d601bb Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 28 May 2020 16:10:17 +0200 Subject: [PATCH 14/30] get_AllRegions: _region_id method unnecessary now --- ocrd_models/ocrd_page_user_methods/get_AllRegions.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py index 6f3b2342a9..ace12ff622 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py @@ -2,9 +2,6 @@ def _region_class(self, x): # pylint: disable=unused-argument return x.__class__.__name__.replace('RegionType', '') -def _region_id(self, x): # pylint: disable=unused-argument - return x.id if hasattr(x, 'id') else x.pcGtsId - def _get_recursive_regions(self, regions, level, classes=None): if level == 1: # stop recursion, filter classes @@ -59,6 +56,6 @@ def get_AllRegions(self, classes=None, order='document', depth=1): if reading_order: reading_order = self._get_recursive_reading_order(reading_order) if reading_order: - id2region = dict([(self._region_id(region), region) for region in ret]) # pylint: disable=consider-using-dict-comprehension + id2region = dict([(region.id, region) for region in ret]) # pylint: disable=consider-using-dict-comprehension ret = [id2region[region_id] for region_id in reading_order if region_id in id2region] return ret From 5c8d89bc8a9ed6815202a6e3281ad97b81e87f20 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 28 May 2020 16:11:16 +0200 Subject: [PATCH 15/30] regenerate PAGE
API --- ocrd_models/ocrd_models/ocrd_page_generateds.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index aac3997445..5725af42b5 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Thu May 28 15:57:42 2020 by generateDS.py version 2.35.20. +# Generated Thu May 28 16:10:37 2020 by generateDS.py version 2.35.20. # Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -2854,9 +2854,6 @@ def __hash__(self): def _region_class(self, x): # pylint: disable=unused-argument return x.__class__.__name__.replace('RegionType', '') - def _region_id(self, x): # pylint: disable=unused-argument - return x.id if hasattr(x, 'id') else x.pcGtsId - def _get_recursive_regions(self, regions, level, classes=None): if level == 1: # stop recursion, filter classes @@ -2895,7 +2892,7 @@ def _get_recursive_reading_order(self, rogroup): def get_AllRegions(self, classes=None, order='document', depth=1): """ Get all the *Region element or only those provided by ``classes``. - Returned in document order unless ``order`` is ``reading-order`` is set (NOT CURRENTLY IMPLEMENTED) + Returned in document order unless ``order`` is ``reading-order`` Arguments: classes (list) Classes of regions that shall be returned, e.g. 
``['Text', 'Image']`` order ("document"|"reading-order") Whether to return regions sorted by document order (default) or by reading order @@ -2911,7 +2908,7 @@ def get_AllRegions(self, classes=None, order='document', depth=1): if reading_order: reading_order = self._get_recursive_reading_order(reading_order) if reading_order: - id2region = dict([(self._region_id(region), region) for region in ret]) # pylint: disable=consider-using-dict-comprehension + id2region = dict([(region.id, region) for region in ret]) # pylint: disable=consider-using-dict-comprehension ret = [id2region[region_id] for region_id in reading_order if region_id in id2region] return ret # end class PageType From f6e3da5fb6162f9ecae31cd76c04d406e849e3ff Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 28 May 2020 17:03:19 +0200 Subject: [PATCH 16/30] :art: pylint --- .../ocrd_models/ocrd_page_generateds.py | 38 +++++++++---------- .../ocrd_page_user_methods/add_AllIndexed.py | 9 +++-- .../ocrd_page_user_methods/exportChildren.py | 9 ++--- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 5725af42b5..a9c0484dc7 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Thu May 28 16:10:37 2020 by generateDS.py version 2.35.20. +# Generated Thu May 28 17:02:23 2020 by generateDS.py version 2.35.20. 
# Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -5418,20 +5418,21 @@ def clear_AllIndexed(self): self.set_UnorderedGroupIndexed([]) return ret + # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring def add_AllIndexed(self, elements): if not isinstance(elements, list): elements = [elements] - for element in sorted(elements, key=lambda x : x.index): - if isinstance(element, RegionRefIndexedType): + for element in sorted(elements, key=lambda x: x.index): + if isinstance(element, RegionRefIndexedType): # pylint: disable=undefined-variable self.add_RegionRefIndexed(element) - elif isinstance(element, OrderedGroupIndexedType): + elif isinstance(element, OrderedGroupIndexedType): # pylint: disable=undefined-variable self.add_OrderedGroupIndexed(element) - elif isinstance(element, UnorderedGroupIndexedType): + elif isinstance(element, UnorderedGroupIndexedType): # pylint: disable=undefined-variable self.add_UnorderedGroupIndexed(element) return self.get_AllIndexed() - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): - eol_ = '\n' if pretty_print else '' + # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring + def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments namespaceprefix_ = 'pc:' if self.UserDefined is not None: self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) @@ -5440,8 +5441,8 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml cleaned = [] # 
remove emtpy groups and replace with RegionRefIndexedType for entry in self.get_AllIndexed(): - if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): - rri = RegionRefIndexedType.factory(parent_object_=self) + if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): # pylint: disable=undefined-variable + rri = RegionRefIndexedType.factory(parent_object_=self) # pylint: disable=undefined-variable rri.index = entry.index rri.regionRef = entry.regionRef cleaned.append(rri) @@ -5449,7 +5450,6 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml cleaned.append(entry) for entry in cleaned: entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) - # end class OrderedGroupIndexedType @@ -6208,20 +6208,21 @@ def clear_AllIndexed(self): self.set_UnorderedGroupIndexed([]) return ret + # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring def add_AllIndexed(self, elements): if not isinstance(elements, list): elements = [elements] - for element in sorted(elements, key=lambda x : x.index): - if isinstance(element, RegionRefIndexedType): + for element in sorted(elements, key=lambda x: x.index): + if isinstance(element, RegionRefIndexedType): # pylint: disable=undefined-variable self.add_RegionRefIndexed(element) - elif isinstance(element, OrderedGroupIndexedType): + elif isinstance(element, OrderedGroupIndexedType): # pylint: disable=undefined-variable self.add_OrderedGroupIndexed(element) - elif isinstance(element, UnorderedGroupIndexedType): + elif isinstance(element, UnorderedGroupIndexedType): # pylint: disable=undefined-variable self.add_UnorderedGroupIndexed(element) return self.get_AllIndexed() - def exportChildren(self, outfile, level, namespaceprefix_='', 
namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): - eol_ = '\n' if pretty_print else '' + # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring + def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments namespaceprefix_ = 'pc:' if self.UserDefined is not None: self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) @@ -6230,8 +6231,8 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml cleaned = [] # remove emtpy groups and replace with RegionRefIndexedType for entry in self.get_AllIndexed(): - if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): - rri = RegionRefIndexedType.factory(parent_object_=self) + if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): # pylint: disable=undefined-variable + rri = RegionRefIndexedType.factory(parent_object_=self) # pylint: disable=undefined-variable rri.index = entry.index rri.regionRef = entry.regionRef cleaned.append(rri) @@ -6239,7 +6240,6 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml cleaned.append(entry) for entry in cleaned: entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) - # end class OrderedGroupType diff --git a/ocrd_models/ocrd_page_user_methods/add_AllIndexed.py b/ocrd_models/ocrd_page_user_methods/add_AllIndexed.py index 36e90ea9f6..45d73eaacc 100644 --- a/ocrd_models/ocrd_page_user_methods/add_AllIndexed.py +++ 
b/ocrd_models/ocrd_page_user_methods/add_AllIndexed.py @@ -1,12 +1,13 @@ +# pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring def add_AllIndexed(self, elements): if not isinstance(elements, list): elements = [elements] - for element in sorted(elements, key=lambda x : x.index): - if isinstance(element, RegionRefIndexedType): + for element in sorted(elements, key=lambda x: x.index): + if isinstance(element, RegionRefIndexedType): # pylint: disable=undefined-variable self.add_RegionRefIndexed(element) - elif isinstance(element, OrderedGroupIndexedType): + elif isinstance(element, OrderedGroupIndexedType): # pylint: disable=undefined-variable self.add_OrderedGroupIndexed(element) - elif isinstance(element, UnorderedGroupIndexedType): + elif isinstance(element, UnorderedGroupIndexedType): # pylint: disable=undefined-variable self.add_UnorderedGroupIndexed(element) return self.get_AllIndexed() diff --git a/ocrd_models/ocrd_page_user_methods/exportChildren.py b/ocrd_models/ocrd_page_user_methods/exportChildren.py index 9f53f66dde..85c7f660c2 100644 --- a/ocrd_models/ocrd_page_user_methods/exportChildren.py +++ b/ocrd_models/ocrd_page_user_methods/exportChildren.py @@ -1,5 +1,5 @@ -def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): - eol_ = '\n' if pretty_print else '' +# pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring +def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments namespaceprefix_ = 'pc:' if self.UserDefined is not None: self.UserDefined.export(outfile, level, namespaceprefix_, 
namespacedef_='', name_='UserDefined', pretty_print=pretty_print) @@ -8,8 +8,8 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml cleaned = [] # remove emtpy groups and replace with RegionRefIndexedType for entry in self.get_AllIndexed(): - if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): - rri = RegionRefIndexedType.factory(parent_object_=self) + if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): # pylint: disable=undefined-variable + rri = RegionRefIndexedType.factory(parent_object_=self) # pylint: disable=undefined-variable rri.index = entry.index rri.regionRef = entry.regionRef cleaned.append(rri) @@ -17,4 +17,3 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml cleaned.append(entry) for entry in cleaned: entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) - From 8351056a3a40ab39fa4bb9e0a7079e3ba7f6a5dc Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 28 May 2020 17:10:56 +0200 Subject: [PATCH 17/30] add_AllIndexed -> extend_AllIndexed --- ocrd_models/ocrd_models/ocrd_page_generateds.py | 6 +++--- ocrd_models/ocrd_page_user_methods.py | 2 +- .../{add_AllIndexed.py => extend_AllIndexed.py} | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) rename ocrd_models/ocrd_page_user_methods/{add_AllIndexed.py => extend_AllIndexed.py} (94%) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index a9c0484dc7..ed8697d9fa 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Thu May 28 17:02:23 2020 by generateDS.py version 2.35.20. +# Generated Thu May 28 17:07:14 2020 by generateDS.py version 2.35.20. 
# Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -5419,7 +5419,7 @@ def clear_AllIndexed(self): return ret # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring - def add_AllIndexed(self, elements): + def extend_AllIndexed(self, elements): if not isinstance(elements, list): elements = [elements] for element in sorted(elements, key=lambda x: x.index): @@ -6209,7 +6209,7 @@ def clear_AllIndexed(self): return ret # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring - def add_AllIndexed(self, elements): + def extend_AllIndexed(self, elements): if not isinstance(elements, list): elements = [elements] for element in sorted(elements, key=lambda x: x.index): diff --git a/ocrd_models/ocrd_page_user_methods.py b/ocrd_models/ocrd_page_user_methods.py index 7f93323794..42f5bfa481 100644 --- a/ocrd_models/ocrd_page_user_methods.py +++ b/ocrd_models/ocrd_page_user_methods.py @@ -100,7 +100,7 @@ def _add_method(class_re, method_name): _add_method(r'^.*$', '__hash__'), _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'get_AllIndexed'), _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'clear_AllIndexed'), - _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'add_AllIndexed'), + _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'extend_AllIndexed'), _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'exportChildren'), _add_method(r'^(PageType)$', 'get_AllRegions'), ) diff --git a/ocrd_models/ocrd_page_user_methods/add_AllIndexed.py b/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py similarity index 94% rename from ocrd_models/ocrd_page_user_methods/add_AllIndexed.py rename to ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py index 45d73eaacc..fb129f5b5f 100644 --- a/ocrd_models/ocrd_page_user_methods/add_AllIndexed.py +++ b/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py @@ -1,5 +1,5 
@@ # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring -def add_AllIndexed(self, elements): +def extend_AllIndexed(self, elements): if not isinstance(elements, list): elements = [elements] for element in sorted(elements, key=lambda x: x.index): From f2022057c16fe2e3c49f571aee75b21681ebdc75 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 28 May 2020 20:30:41 +0200 Subject: [PATCH 18/30] get_AllRegions: differentiate "reading-order"/"reading-order-only" --- ocrd_models/README.md | 5 +++- .../ocrd_models/ocrd_page_generateds.py | 29 ++++++++++++++----- .../ocrd_page_user_methods/get_AllRegions.py | 27 ++++++++++++----- tests/model/test_ocrd_page.py | 10 +++++-- 4 files changed, 53 insertions(+), 18 deletions(-) diff --git a/ocrd_models/README.md b/ocrd_models/README.md index 7e6ec6216b..36ebf381d0 100644 --- a/ocrd_models/README.md +++ b/ocrd_models/README.md @@ -15,7 +15,10 @@ def get_FirstTextRegion(self): return self.get_TextRegion[0] ``` - (Note that the method name and file name must be identical.) + **NOTE** The method name and file name must be identical. + + **NOTE** Do not use Python's `%` string interpolation operator, it will break generateDS. Use `"".format(...)` instead. + 2. Edit `ocrd_models/ocrd_page_user_methods.py` and append to the `METHOD_SPECS` list: ```python diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index ed8697d9fa..1bc34f8f4c 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Thu May 28 17:07:14 2020 by generateDS.py version 2.35.20. +# Generated Thu May 28 20:28:48 2020 by generateDS.py version 2.35.20. 
# Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -2860,7 +2860,7 @@ def _get_recursive_regions(self, regions, level, classes=None): if classes: return [r for r in regions if self._region_class(r) in classes] # remove the first element (PageType) - return regions[1:] + return list(set(regions[1:])) # find more regions recursively more_regions = [] for region in regions: @@ -2895,21 +2895,34 @@ def get_AllRegions(self, classes=None, order='document', depth=1): Returned in document order unless ``order`` is ``reading-order`` Arguments: classes (list) Classes of regions that shall be returned, e.g. ``['Text', 'Image']`` - order ("document"|"reading-order") Whether to return regions sorted by document order (default) or by reading order + order ("document"|"reading-order"|"reading-order-only") Whether to + return regions sorted by document order (``document``, default) or by + reading order with regions not in the reading order at the end of the + returned list (``reading-order``) or regions not in the reading order + omitted (``reading-order-only``) depth (int) Recursive depth to look for regions at. 
Default: 1 """ - if order not in ['document', 'reading-order']: + if order not in ['document', 'reading-order', 'reading-order-only']: raise Exception("Argument 'order' must be either 'document' or 'reading-order', not '{}'".format(order)) - ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes) - if order == 'reading-order': + ret = self._get_recursive_regions([self], depth + 1, classes) + if order.startswith('reading-order'): reading_order = self.get_ReadingOrder() if reading_order: reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() if reading_order: reading_order = self._get_recursive_reading_order(reading_order) if reading_order: - id2region = dict([(region.id, region) for region in ret]) # pylint: disable=consider-using-dict-comprehension - ret = [id2region[region_id] for region_id in reading_order if region_id in id2region] + id2region = {region.id: region for region in ret} + in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region] + # print("ret: {} / in_ro: {} / not-in-ro: {}".format( + # len(ret), + # len([id2region[region_id] for region_id in reading_order if region_id in id2region]), + # len([r for r in ret if r not in in_reading_order]) + # )) + if order == 'reading-order-only': + ret = in_reading_order + else: + ret = in_reading_order + [r for r in ret if r not in in_reading_order] return ret # end class PageType diff --git a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py index 98a539c90d..8dbf7de135 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py @@ -8,7 +8,7 @@ def _get_recursive_regions(self, regions, level, classes=None): if classes: return [r for r in regions if self._region_class(r) in classes] # remove the first element (PageType) - return regions[1:] + return list(set(regions[1:])) # find more regions 
recursively more_regions = [] for region in regions: @@ -43,19 +43,32 @@ def get_AllRegions(self, classes=None, order='document', depth=1): Returned in document order unless ``order`` is ``reading-order`` Arguments: classes (list) Classes of regions that shall be returned, e.g. ``['Text', 'Image']`` - order ("document"|"reading-order") Whether to return regions sorted by document order (default) or by reading order + order ("document"|"reading-order"|"reading-order-only") Whether to + return regions sorted by document order (``document``, default) or by + reading order with regions not in the reading order at the end of the + returned list (``reading-order``) or regions not in the reading order + omitted (``reading-order-only``) depth (int) Recursive depth to look for regions at. Default: 1 """ - if order not in ['document', 'reading-order']: + if order not in ['document', 'reading-order', 'reading-order-only']: raise Exception("Argument 'order' must be either 'document' or 'reading-order', not '{}'".format(order)) - ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes) - if order == 'reading-order': + ret = self._get_recursive_regions([self], depth + 1, classes) + if order.startswith('reading-order'): reading_order = self.get_ReadingOrder() if reading_order: reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() if reading_order: reading_order = self._get_recursive_reading_order(reading_order) if reading_order: - id2region = dict([(region.id, region) for region in ret]) # pylint: disable=consider-using-dict-comprehension - ret = [id2region[region_id] for region_id in reading_order if region_id in id2region] + id2region = {region.id: region for region in ret} + in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region] + # print("ret: {} / in_ro: {} / not-in-ro: {}".format( + # len(ret), + # len([id2region[region_id] for region_id in reading_order if region_id in 
id2region]), + # len([r for r in ret if r not in in_reading_order]) + # )) + if order == 'reading-order-only': + ret = in_reading_order + else: + ret = in_reading_order + [r for r in ret if r not in in_reading_order] return ret diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 7ee58e421f..ce36433443 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -193,9 +193,13 @@ def test_all_regions_without_reading_order(self): pcgts = parseString(f.read().encode('utf8'), silence=True) pg = pcgts.get_Page() self.assertEqual(len(pg.get_AllRegions()), 45) + self.assertEqual(len(pg.get_AllRegions(depth=1)), 45) + self.assertEqual(len(pg.get_AllRegions(depth=2)), 65) + self.assertEqual(len(pg.get_AllRegions(depth=3)), 65) self.assertEqual(len(pg.get_AllRegions(classes=['Separator'])), 25) self.assertEqual(len(pg.get_AllRegions(classes=['Table'])), 3) self.assertEqual(len(pg.get_AllRegions(classes=['Text'])), 17) + self.assertEqual(len(pg.get_AllRegions(classes=['Text'], depth=2)), 54) def test_all_regions_with_reading_order(self): """ @@ -207,8 +211,10 @@ def test_all_regions_with_reading_order(self): pg = pcgts.get_Page() with self.assertRaisesRegex(Exception, "Argument 'order' must be either 'document' or 'reading-order', not 'random'"): pg.get_AllRegions(order='random') - self.assertEqual(len(pg.get_AllRegions(order='reading-order')), 20) - self.assertEqual(len(pg.get_AllRegions(classes=['Table'], order='reading-order')), 3) + self.assertEqual(len(pg.get_AllRegions(order='reading-order-only')), 20) + self.assertEqual(len(pg.get_AllRegions(order='reading-order-only', depth=2)), 40) + self.assertEqual(len(pg.get_AllRegions(order='reading-order', depth=1)), 45) + self.assertEqual(len(pg.get_AllRegions(order='reading-order', depth=2)), 65) self.assertEqual(len(pg.get_AllRegions(classes=['Table'], order='reading-order')), 3) self.assertEqual(len(pg.get_AllRegions(classes=['Text'], order='reading-order')), 17) From 
ffba6f937dcb6418104025fad8fe8dfaa10ce25c Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 29 May 2020 13:40:22 +0200 Subject: [PATCH 19/30] get_AllRegions: catch negative depth, test depth==0 --- ocrd_models/ocrd_models/ocrd_page_generateds.py | 6 ++++-- ocrd_models/ocrd_page_user_methods/get_AllRegions.py | 4 +++- tests/model/test_ocrd_page.py | 9 ++++++--- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 1bc34f8f4c..a1aa2f976c 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Thu May 28 20:28:48 2020 by generateDS.py version 2.35.20. +# Generated Fri May 29 13:39:11 2020 by generateDS.py version 2.35.20. # Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -2903,7 +2903,9 @@ def get_AllRegions(self, classes=None, order='document', depth=1): depth (int) Recursive depth to look for regions at. 
Default: 1 """ if order not in ['document', 'reading-order', 'reading-order-only']: - raise Exception("Argument 'order' must be either 'document' or 'reading-order', not '{}'".format(order)) + raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order)) + if depth < 0: + raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth)) ret = self._get_recursive_regions([self], depth + 1, classes) if order.startswith('reading-order'): reading_order = self.get_ReadingOrder() diff --git a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py index 8dbf7de135..90d4a193f6 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py @@ -51,7 +51,9 @@ def get_AllRegions(self, classes=None, order='document', depth=1): depth (int) Recursive depth to look for regions at. Default: 1 """ if order not in ['document', 'reading-order', 'reading-order-only']: - raise Exception("Argument 'order' must be either 'document' or 'reading-order', not '{}'".format(order)) + raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order)) + if depth < 0: + raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth)) ret = self._get_recursive_regions([self], depth + 1, classes) if order.startswith('reading-order'): reading_order = self.get_ReadingOrder() diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index ce36433443..0e32cbeaff 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -193,6 +193,7 @@ def test_all_regions_without_reading_order(self): pcgts = parseString(f.read().encode('utf8'), silence=True) pg = pcgts.get_Page() self.assertEqual(len(pg.get_AllRegions()), 45) + self.assertEqual(len(pg.get_AllRegions(depth=0)), 0) 
self.assertEqual(len(pg.get_AllRegions(depth=1)), 45) self.assertEqual(len(pg.get_AllRegions(depth=2)), 65) self.assertEqual(len(pg.get_AllRegions(depth=3)), 65) @@ -207,12 +208,14 @@ def test_all_regions_with_reading_order(self): https://github.com/OCR-D/core/issues/240#issuecomment-493135797 """ with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: - pcgts = parseString(f.read().encode('utf8'), silence=True) - pg = pcgts.get_Page() - with self.assertRaisesRegex(Exception, "Argument 'order' must be either 'document' or 'reading-order', not 'random'"): + pg = parseString(f.read().encode('utf8'), silence=True).get_Page() + with self.assertRaisesRegex(Exception, "Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not 'random'"): pg.get_AllRegions(order='random') + with self.assertRaisesRegex(Exception, "Argument 'depth' must be an integer greater-or-equal 0, not '-1'"): + pg.get_AllRegions(depth=-1) self.assertEqual(len(pg.get_AllRegions(order='reading-order-only')), 20) self.assertEqual(len(pg.get_AllRegions(order='reading-order-only', depth=2)), 40) + self.assertEqual(len(pg.get_AllRegions(order='reading-order', depth=0)), 0) self.assertEqual(len(pg.get_AllRegions(order='reading-order', depth=1)), 45) self.assertEqual(len(pg.get_AllRegions(order='reading-order', depth=2)), 65) self.assertEqual(len(pg.get_AllRegions(classes=['Table'], order='reading-order')), 3) From 207f396c2fe4cf3d32f5456f74a01c64e7a5ac17 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 29 May 2020 14:46:43 +0200 Subject: [PATCH 20/30] :memo: get_AllRegions: document example --- ocrd_models/ocrd_models/ocrd_page_generateds.py | 8 +++++++- ocrd_models/ocrd_page_user_methods/get_AllRegions.py | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index a1aa2f976c..2bf8827c3d 100644 --- 
a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Fri May 29 13:39:11 2020 by generateDS.py version 2.35.20. +# Generated Fri May 29 14:45:59 2020 by generateDS.py version 2.35.20. # Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -2901,6 +2901,12 @@ def get_AllRegions(self, classes=None, order='document', depth=1): returned list (``reading-order``) or regions not in the reading order omitted (``reading-order-only``) depth (int) Recursive depth to look for regions at. Default: 1 + + For example, to get all text anywhere on the page in reading order, use: + :: + '\n'.join(line.get_TextEquiv()[0].Unicode + for region in page.get_AllRegions(classes='Text', depth=0, order='reading-order') + for line in region.get_TextLine()) """ if order not in ['document', 'reading-order', 'reading-order-only']: raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order)) diff --git a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py index 90d4a193f6..4a4dacbf9a 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py @@ -49,6 +49,12 @@ def get_AllRegions(self, classes=None, order='document', depth=1): returned list (``reading-order``) or regions not in the reading order omitted (``reading-order-only``) depth (int) Recursive depth to look for regions at. 
Default: 1 + + For example, to get all text anywhere on the page in reading order, use: + :: + '\n'.join(line.get_TextEquiv()[0].Unicode + for region in page.get_AllRegions(classes='Text', depth=0, order='reading-order') + for line in region.get_TextLine()) """ if order not in ['document', 'reading-order', 'reading-order-only']: raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order)) From 9ced31514599a87bfabf8c2a0accf28e26747b21 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 29 May 2020 16:34:54 +0200 Subject: [PATCH 21/30] get_AllRegions: fix recursion --- .../ocrd_models/ocrd_page_generateds.py | 22 +++++++++++-------- .../ocrd_page_user_methods/get_AllRegions.py | 20 ++++++++++------- tests/model/test_ocrd_page.py | 17 ++++++++------ 3 files changed, 35 insertions(+), 24 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 2bf8827c3d..ab581a4ab4 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Fri May 29 14:45:59 2020 by generateDS.py version 2.35.20. +# Generated Fri May 29 16:34:32 2020 by generateDS.py version 2.35.20. 
# Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -2859,8 +2859,9 @@ def _get_recursive_regions(self, regions, level, classes=None): # stop recursion, filter classes if classes: return [r for r in regions if self._region_class(r) in classes] - # remove the first element (PageType) - return list(set(regions[1:])) + if regions and regions[0].__class__.__name__ == 'PageType': + regions = regions[1:] + return regions # find more regions recursively more_regions = [] for region in regions: @@ -2873,9 +2874,12 @@ def _get_recursive_regions(self, regions, level, classes=None): continue more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))() if not any(more_regions): - return _get_recursive_regions(regions, 1, classes) - regions = [region for r, more in zip(regions, more_regions) for region in [r] + more] - return self._get_recursive_regions(regions, level - 1 if level else 0, classes) + return self._get_recursive_regions(regions, 1, classes) + ret = [] + for r, more in zip(regions, more_regions): + ret.append(r) + ret += self._get_recursive_regions(more, level - 1 if level else 0, classes) + return self._get_recursive_regions(ret, 1, classes) def _get_recursive_reading_order(self, rogroup): if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable @@ -2889,7 +2893,7 @@ def _get_recursive_reading_order(self, rogroup): regionrefs.extend(self._get_recursive_reading_order(elem)) return regionrefs - def get_AllRegions(self, classes=None, order='document', depth=1): + def get_AllRegions(self, classes=None, order='document', depth=0): """ Get all the *Region element or only those provided by ``classes``. 
Returned in document order unless ``order`` is ``reading-order`` @@ -2900,7 +2904,7 @@ def get_AllRegions(self, classes=None, order='document', depth=1): reading order with regions not in the reading order at the end of the returned list (``reading-order``) or regions not in the reading order omitted (``reading-order-only``) - depth (int) Recursive depth to look for regions at. Default: 1 + depth (int) Recursive depth to look for regions at, set to `0` for all regions at any depth. Default: 0 For example, to get all text anywhere on the page in reading order, use: :: @@ -2912,7 +2916,7 @@ def get_AllRegions(self, classes=None, order='document', depth=1): raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order)) if depth < 0: raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth)) - ret = self._get_recursive_regions([self], depth + 1, classes) + ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes) if order.startswith('reading-order'): reading_order = self.get_ReadingOrder() if reading_order: diff --git a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py index 4a4dacbf9a..25c1d2c69e 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py @@ -7,8 +7,9 @@ def _get_recursive_regions(self, regions, level, classes=None): # stop recursion, filter classes if classes: return [r for r in regions if self._region_class(r) in classes] - # remove the first element (PageType) - return list(set(regions[1:])) + if regions and regions[0].__class__.__name__ == 'PageType': + regions = regions[1:] + return regions # find more regions recursively more_regions = [] for region in regions: @@ -21,9 +22,12 @@ def _get_recursive_regions(self, regions, level, classes=None): continue more_regions[-1] += getattr(region, 
'get_{}Region'.format(class_))() if not any(more_regions): - return _get_recursive_regions(regions, 1, classes) - regions = [region for r, more in zip(regions, more_regions) for region in [r] + more] - return self._get_recursive_regions(regions, level - 1 if level else 0, classes) + return self._get_recursive_regions(regions, 1, classes) + ret = [] + for r, more in zip(regions, more_regions): + ret.append(r) + ret += self._get_recursive_regions(more, level - 1 if level else 0, classes) + return self._get_recursive_regions(ret, 1, classes) def _get_recursive_reading_order(self, rogroup): if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable @@ -37,7 +41,7 @@ def _get_recursive_reading_order(self, rogroup): regionrefs.extend(self._get_recursive_reading_order(elem)) return regionrefs -def get_AllRegions(self, classes=None, order='document', depth=1): +def get_AllRegions(self, classes=None, order='document', depth=0): """ Get all the *Region element or only those provided by ``classes``. Returned in document order unless ``order`` is ``reading-order`` @@ -48,7 +52,7 @@ def get_AllRegions(self, classes=None, order='document', depth=1): reading order with regions not in the reading order at the end of the returned list (``reading-order``) or regions not in the reading order omitted (``reading-order-only``) - depth (int) Recursive depth to look for regions at. Default: 1 + depth (int) Recursive depth to look for regions at, set to `0` for all regions at any depth. 
Default: 0 For example, to get all text anywhere on the page in reading order, use: :: @@ -60,7 +64,7 @@ def get_AllRegions(self, classes=None, order='document', depth=1): raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order)) if depth < 0: raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth)) - ret = self._get_recursive_regions([self], depth + 1, classes) + ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes) if order.startswith('reading-order'): reading_order = self.get_ReadingOrder() if reading_order: diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 0e32cbeaff..5a950d5149 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -192,15 +192,16 @@ def test_all_regions_without_reading_order(self): with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: pcgts = parseString(f.read().encode('utf8'), silence=True) pg = pcgts.get_Page() - self.assertEqual(len(pg.get_AllRegions()), 45) - self.assertEqual(len(pg.get_AllRegions(depth=0)), 0) + self.assertEqual(len(pg.get_AllRegions()), 65) + self.assertEqual(len(pg.get_AllRegions(depth=0)), 65) self.assertEqual(len(pg.get_AllRegions(depth=1)), 45) self.assertEqual(len(pg.get_AllRegions(depth=2)), 65) self.assertEqual(len(pg.get_AllRegions(depth=3)), 65) self.assertEqual(len(pg.get_AllRegions(classes=['Separator'])), 25) self.assertEqual(len(pg.get_AllRegions(classes=['Table'])), 3) - self.assertEqual(len(pg.get_AllRegions(classes=['Text'])), 17) - self.assertEqual(len(pg.get_AllRegions(classes=['Text'], depth=2)), 54) + self.assertEqual(len(pg.get_AllRegions(classes=['Text'])), 37) + self.assertEqual(len(pg.get_AllRegions(classes=['Text'], depth=1)), 17) + self.assertEqual(len(pg.get_AllRegions(classes=['Text'], depth=2)), 37) def test_all_regions_with_reading_order(self): """ @@ -213,13 +214,15 @@ def 
test_all_regions_with_reading_order(self): pg.get_AllRegions(order='random') with self.assertRaisesRegex(Exception, "Argument 'depth' must be an integer greater-or-equal 0, not '-1'"): pg.get_AllRegions(depth=-1) - self.assertEqual(len(pg.get_AllRegions(order='reading-order-only')), 20) + self.assertEqual(len(pg.get_AllRegions(order='reading-order-only')), 40) + self.assertEqual(len(pg.get_AllRegions(order='reading-order-only', depth=1)), 20) self.assertEqual(len(pg.get_AllRegions(order='reading-order-only', depth=2)), 40) - self.assertEqual(len(pg.get_AllRegions(order='reading-order', depth=0)), 0) + self.assertEqual(len(pg.get_AllRegions(order='reading-order', depth=0)), 65) self.assertEqual(len(pg.get_AllRegions(order='reading-order', depth=1)), 45) self.assertEqual(len(pg.get_AllRegions(order='reading-order', depth=2)), 65) self.assertEqual(len(pg.get_AllRegions(classes=['Table'], order='reading-order')), 3) - self.assertEqual(len(pg.get_AllRegions(classes=['Text'], order='reading-order')), 17) + self.assertEqual(len(pg.get_AllRegions(classes=['Text'], order='reading-order')), 37) + self.assertEqual(len(pg.get_AllRegions(classes=['Text'], order='reading-order', depth=1)), 17) if __name__ == '__main__': main() From 629f38d2ce45bac6887c3ac3a8db47cc85a53b15 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 29 May 2020 19:58:28 +0200 Subject: [PATCH 22/30] get_AllRegions: Update example Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_models/ocrd_page_user_methods/get_AllRegions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py index 25c1d2c69e..bae63bef66 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllRegions.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllRegions.py @@ -56,8 +56,8 @@ def get_AllRegions(self, classes=None, order='document', depth=0): For example, to get all 
text anywhere on the page in reading order, use: :: - '\n'.join(line.get_TextEquiv()[0].Unicode - for region in page.get_AllRegions(classes='Text', depth=0, order='reading-order') + '\\n'.join(line.get_TextEquiv()[0].Unicode + for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order') for line in region.get_TextLine()) """ if order not in ['document', 'reading-order', 'reading-order-only']: From e958559fbdf702347a0ed870e9df7b625b59a5e0 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 29 May 2020 21:54:54 +0200 Subject: [PATCH 23/30] wip --- .../ocrd_models/ocrd_page_generateds.py | 54 +++++++++++++++---- .../extend_AllIndexed.py | 5 +- .../ocrd_page_user_methods/get_AllIndexed.py | 19 ++++++- tests/model/TEMP1_Gutachten2-2.xml | 3 ++ tests/model/test_ocrd_page.py | 9 ++++ 5 files changed, 78 insertions(+), 12 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index ab581a4ab4..1bc292bc15 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Fri May 29 16:34:32 2020 by generateDS.py version 2.35.20. +# Generated Fri May 29 21:46:01 2020 by generateDS.py version 2.35.20. 
# Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -2908,8 +2908,8 @@ def get_AllRegions(self, classes=None, order='document', depth=0): For example, to get all text anywhere on the page in reading order, use: :: - '\n'.join(line.get_TextEquiv()[0].Unicode - for region in page.get_AllRegions(classes='Text', depth=0, order='reading-order') + '\\n'.join(line.get_TextEquiv()[0].Unicode + for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order') for line in region.get_TextLine()) """ if order not in ['document', 'reading-order', 'reading-order-only']: @@ -5433,9 +5433,24 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.original_tagname_ = 'UnorderedGroupIndexed' def __hash__(self): return hash(self.id) - def get_AllIndexed(self): - return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) + # pylint: disable=invalid-name,missing-module-docstring,line-too-long + def get_AllIndexed(self, classes=None): + """ + Get all indexed children sorted by their ``@index``. + Arguments: + classes (list): Type of children to return. 
Default: ['RegionRef', 'OrderedGroup', 'UnorderedGroup'] + """ + if not classes: + classes = ['RegionRef', 'OrderedGroup', 'UnorderedGroup'] + ret = [] + if 'RegionRef' in classes: + ret += self.get_RegionRefIndexed() + if 'OrderedGroup' in classes: + ret += self.get_OrderedGroupIndexed() + if 'UnorderedGroup' in classes: + ret += self.get_UnorderedGroupIndexed() + return sorted(ret, key=lambda x: x.index) def clear_AllIndexed(self): ret = self.get_AllIndexed() self.set_RegionRefIndexed([]) @@ -5443,8 +5458,11 @@ def clear_AllIndexed(self): self.set_UnorderedGroupIndexed([]) return ret - # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring + # pylint: disable=line-too-long,invalid-name,missing-module-docstring def extend_AllIndexed(self, elements): + """ + Add all elements in list ``elements``, respecting ``@index`` order. + """ if not isinstance(elements, list): elements = [elements] for element in sorted(elements, key=lambda x: x.index): @@ -6223,9 +6241,24 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.original_tagname_ = 'UnorderedGroupIndexed' def __hash__(self): return hash(self.id) - def get_AllIndexed(self): - return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) + # pylint: disable=invalid-name,missing-module-docstring,line-too-long + def get_AllIndexed(self, classes=None): + """ + Get all indexed children sorted by their ``@index``. + Arguments: + classes (list): Type of children to return. 
Default: ['RegionRef', 'OrderedGroup', 'UnorderedGroup'] + """ + if not classes: + classes = ['RegionRef', 'OrderedGroup', 'UnorderedGroup'] + ret = [] + if 'RegionRef' in classes: + ret += self.get_RegionRefIndexed() + if 'OrderedGroup' in classes: + ret += self.get_OrderedGroupIndexed() + if 'UnorderedGroup' in classes: + ret += self.get_UnorderedGroupIndexed() + return sorted(ret, key=lambda x: x.index) def clear_AllIndexed(self): ret = self.get_AllIndexed() self.set_RegionRefIndexed([]) @@ -6233,8 +6266,11 @@ def clear_AllIndexed(self): self.set_UnorderedGroupIndexed([]) return ret - # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring + # pylint: disable=line-too-long,invalid-name,missing-module-docstring def extend_AllIndexed(self, elements): + """ + Add all elements in list ``elements``, respecting ``@index`` order. + """ if not isinstance(elements, list): elements = [elements] for element in sorted(elements, key=lambda x: x.index): diff --git a/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py b/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py index fb129f5b5f..e364a8cc11 100644 --- a/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py +++ b/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py @@ -1,5 +1,8 @@ -# pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring +# pylint: disable=line-too-long,invalid-name,missing-module-docstring def extend_AllIndexed(self, elements): + """ + Add all elements in list ``elements``, respecting ``@index`` order. 
+ """ if not isinstance(elements, list): elements = [elements] for element in sorted(elements, key=lambda x: x.index): diff --git a/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py b/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py index 4d79632353..6618b6bf3e 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py @@ -1,3 +1,18 @@ -def get_AllIndexed(self): - return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) +# pylint: disable=invalid-name,missing-module-docstring,line-too-long +def get_AllIndexed(self, classes=None): + """ + Get all indexed children sorted by their ``@index``. + Arguments: + classes (list): Type of children to return. Default: ['RegionRef', 'OrderedGroup', 'UnorderedGroup'] + """ + if not classes: + classes = ['RegionRef', 'OrderedGroup', 'UnorderedGroup'] + ret = [] + if 'RegionRef' in classes: + ret += self.get_RegionRefIndexed() + if 'OrderedGroup' in classes: + ret += self.get_OrderedGroupIndexed() + if 'UnorderedGroup' in classes: + ret += self.get_UnorderedGroupIndexed() + return sorted(ret, key=lambda x: x.index) diff --git a/tests/model/TEMP1_Gutachten2-2.xml b/tests/model/TEMP1_Gutachten2-2.xml index f0d7cbd33b..581af560d9 100644 --- a/tests/model/TEMP1_Gutachten2-2.xml +++ b/tests/model/TEMP1_Gutachten2-2.xml @@ -117,6 +117,9 @@ + + + diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 5a950d5149..d795aa7c0b 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -224,5 +224,14 @@ def test_all_regions_with_reading_order(self): self.assertEqual(len(pg.get_AllRegions(classes=['Text'], order='reading-order')), 37) self.assertEqual(len(pg.get_AllRegions(classes=['Text'], order='reading-order', depth=1)), 17) + def test_get_AllIndexed_classes(self): + with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: + og = 
parseString(f.read().encode('utf8'), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup() + self.assertEqual(len(og.get_AllIndexed(classes=['RegionRef'])), 17) + self.assertEqual(len(og.get_AllIndexed(classes=['OrderedGroup'])), 3) + self.assertEqual(len(og.get_AllIndexed(classes=['UnorderedGroup'])), 1) + + + if __name__ == '__main__': main() From 196456386e8e656ad6c463379ba8085d42ac3c8b Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 29 May 2020 22:35:46 +0200 Subject: [PATCH 24/30] reading order test sample: add unorderedgroups for testing --- tests/model/TEMP1_Gutachten2-2.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/model/TEMP1_Gutachten2-2.xml b/tests/model/TEMP1_Gutachten2-2.xml index f0d7cbd33b..1f54b4db9a 100644 --- a/tests/model/TEMP1_Gutachten2-2.xml +++ b/tests/model/TEMP1_Gutachten2-2.xml @@ -117,6 +117,11 @@ + + + + + From 27e256f52165763b4a4d02d93bac6ac0a61f619c Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 29 May 2020 22:36:48 +0200 Subject: [PATCH 25/30] add get_UnorderedGroupChildren, let get_AllIndexed handle UnorderedGroupIndexed properly --- .../ocrd_models/ocrd_page_generateds.py | 54 +++++++++++++------ ocrd_models/ocrd_page_user_methods.py | 1 + .../ocrd_page_user_methods/exportChildren.py | 15 ++++-- .../ocrd_page_user_methods/get_AllIndexed.py | 2 +- .../get_UnorderedGroupChildren.py | 7 +++ tests/model/test_ocrd_page.py | 20 +++++-- 6 files changed, 73 insertions(+), 26 deletions(-) create mode 100644 ocrd_models/ocrd_page_user_methods/get_UnorderedGroupChildren.py diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index ab581a4ab4..4878390c5a 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Fri May 29 16:34:32 2020 by generateDS.py version 2.35.20. +# Generated Fri May 29 22:34:55 2020 by generateDS.py version 2.35.20. 
# Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -2908,8 +2908,8 @@ def get_AllRegions(self, classes=None, order='document', depth=0): For example, to get all text anywhere on the page in reading order, use: :: - '\n'.join(line.get_TextEquiv()[0].Unicode - for region in page.get_AllRegions(classes='Text', depth=0, order='reading-order') + '\\n'.join(line.get_TextEquiv()[0].Unicode + for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order') for line in region.get_TextLine()) """ if order not in ['document', 'reading-order', 'reading-order-only']: @@ -5434,7 +5434,7 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec def __hash__(self): return hash(self.id) def get_AllIndexed(self): - return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) + return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x: x.index) def clear_AllIndexed(self): ret = self.get_AllIndexed() @@ -5464,13 +5464,18 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) cleaned = [] + def replaceWithRRI(group): + rri = RegionRefIndexedType.factory(parent_object_=self) # pylint: disable=undefined-variable + rri.index = group.index + rri.regionRef = group.regionRef + cleaned.append(rri) # remove emtpy groups and replace with RegionRefIndexedType for entry in self.get_AllIndexed(): - if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): # pylint: disable=undefined-variable - rri = RegionRefIndexedType.factory(parent_object_=self) # pylint: disable=undefined-variable - rri.index = entry.index - rri.regionRef = entry.regionRef - cleaned.append(rri) + 
# pylint: disable=undefined-variable + if isinstance(entry, (OrderedGroupIndexedType)) and not entry.get_AllIndexed(): + replaceWithRRI(entry) + elif isinstance(entry, UnorderedGroupIndexedType) and not entry.get_UnorderedGroupChildren(): + replaceWithRRI(entry) else: cleaned.append(entry) for entry in cleaned: @@ -5811,6 +5816,13 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.original_tagname_ = 'UnorderedGroup' def __hash__(self): return hash(self.id) + def get_UnorderedGroupChildren(self): + """ + List all non-metadata children of an UnorderedGroup + """ + # TODO: should not change order + return self.get_RegionRef() + self.get_OrderedGroup() + self.get_UnorderedGroup() + # end class UnorderedGroupIndexedType @@ -6224,7 +6236,7 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec def __hash__(self): return hash(self.id) def get_AllIndexed(self): - return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) + return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x: x.index) def clear_AllIndexed(self): ret = self.get_AllIndexed() @@ -6254,13 +6266,18 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) cleaned = [] + def replaceWithRRI(group): + rri = RegionRefIndexedType.factory(parent_object_=self) # pylint: disable=undefined-variable + rri.index = group.index + rri.regionRef = group.regionRef + cleaned.append(rri) # remove emtpy groups and replace with RegionRefIndexedType for entry in self.get_AllIndexed(): - if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): # pylint: disable=undefined-variable - rri = 
RegionRefIndexedType.factory(parent_object_=self) # pylint: disable=undefined-variable - rri.index = entry.index - rri.regionRef = entry.regionRef - cleaned.append(rri) + # pylint: disable=undefined-variable + if isinstance(entry, (OrderedGroupIndexedType)) and not entry.get_AllIndexed(): + replaceWithRRI(entry) + elif isinstance(entry, UnorderedGroupIndexedType) and not entry.get_UnorderedGroupChildren(): + replaceWithRRI(entry) else: cleaned.append(entry) for entry in cleaned: @@ -6585,6 +6602,13 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.original_tagname_ = 'UnorderedGroup' def __hash__(self): return hash(self.id) + def get_UnorderedGroupChildren(self): + """ + List all non-metadata children of an UnorderedGroup + """ + # TODO: should not change order + return self.get_RegionRef() + self.get_OrderedGroup() + self.get_UnorderedGroup() + # end class UnorderedGroupType diff --git a/ocrd_models/ocrd_page_user_methods.py b/ocrd_models/ocrd_page_user_methods.py index 42f5bfa481..59ca85b7e0 100644 --- a/ocrd_models/ocrd_page_user_methods.py +++ b/ocrd_models/ocrd_page_user_methods.py @@ -102,6 +102,7 @@ def _add_method(class_re, method_name): _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'clear_AllIndexed'), _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'extend_AllIndexed'), _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'exportChildren'), + _add_method(r'^(UnorderedGroupType|UnorderedGroupIndexedType)$', 'get_UnorderedGroupChildren'), _add_method(r'^(PageType)$', 'get_AllRegions'), ) diff --git a/ocrd_models/ocrd_page_user_methods/exportChildren.py b/ocrd_models/ocrd_page_user_methods/exportChildren.py index 85c7f660c2..8f118b71be 100644 --- a/ocrd_models/ocrd_page_user_methods/exportChildren.py +++ b/ocrd_models/ocrd_page_user_methods/exportChildren.py @@ -6,13 +6,18 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: 
Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) cleaned = [] + def replaceWithRRI(group): + rri = RegionRefIndexedType.factory(parent_object_=self) # pylint: disable=undefined-variable + rri.index = group.index + rri.regionRef = group.regionRef + cleaned.append(rri) # remove emtpy groups and replace with RegionRefIndexedType for entry in self.get_AllIndexed(): - if isinstance(entry, (UnorderedGroupIndexedType, OrderedGroupIndexedType)) and not entry.get_AllIndexed(): # pylint: disable=undefined-variable - rri = RegionRefIndexedType.factory(parent_object_=self) # pylint: disable=undefined-variable - rri.index = entry.index - rri.regionRef = entry.regionRef - cleaned.append(rri) + # pylint: disable=undefined-variable + if isinstance(entry, (OrderedGroupIndexedType)) and not entry.get_AllIndexed(): + replaceWithRRI(entry) + elif isinstance(entry, UnorderedGroupIndexedType) and not entry.get_UnorderedGroupChildren(): + replaceWithRRI(entry) else: cleaned.append(entry) for entry in cleaned: diff --git a/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py b/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py index 4d79632353..7f2a5434b4 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py @@ -1,3 +1,3 @@ def get_AllIndexed(self): - return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x : x.index) + return sorted(self.get_RegionRefIndexed() + self.get_OrderedGroupIndexed() + self.get_UnorderedGroupIndexed(), key=lambda x: x.index) diff --git a/ocrd_models/ocrd_page_user_methods/get_UnorderedGroupChildren.py b/ocrd_models/ocrd_page_user_methods/get_UnorderedGroupChildren.py new file mode 100644 index 0000000000..61c1d9450f --- /dev/null +++ b/ocrd_models/ocrd_page_user_methods/get_UnorderedGroupChildren.py @@ -0,0 +1,7 @@ +def get_UnorderedGroupChildren(self): + """ + 
List all non-metadata children of an UnorderedGroup + """ + # TODO: should not change order + return self.get_RegionRef() + self.get_OrderedGroup() + self.get_UnorderedGroup() + diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 5a950d5149..34d79b5f4a 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -8,6 +8,7 @@ TextRegionType, TextLineType, OrderedGroupIndexedType, + UnorderedGroupIndexedType, RegionRefIndexedType, WordType, GlyphType, @@ -151,19 +152,19 @@ def test_orderedgroup_export_order(self): og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup() xml_before = to_xml(og) children = og.get_AllIndexed() - self.assertEqual(len(children), 20) - self.assertEqual([c.index for c in children], list(range(0, 20))) + self.assertEqual(len(children), 22) + self.assertEqual([c.index for c in children], list(range(0, 22))) # mix up the indexes children[0].index = 11 children[11].index = 3 children[3].index = 0 - self.assertEqual([c.index for c in children], [11, 1, 2, 0, 4, 5, 6, 7, 8, 9, 10, 3, 12, 13, 14, 15, 16, 17, 18, 19]) - self.assertEqual([c.index for c in og.get_AllIndexed()], list(range(0, 20))) + self.assertEqual([c.index for c in children], [11, 1, 2, 0, 4, 5, 6, 7, 8, 9, 10, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]) + self.assertEqual([c.index for c in og.get_AllIndexed()], list(range(0, 22))) self.assertEqual(og.get_AllIndexed()[1].__class__, OrderedGroupIndexedType) # serialize and make sure the correct order was serialized new_pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True) new_og = new_pcgts.get_Page().get_ReadingOrder().get_OrderedGroup() - self.assertEqual([c.index for c in new_og.get_AllIndexed()], list(range(0, 20))) + self.assertEqual([c.index for c in new_og.get_AllIndexed()], list(range(0, 22))) # xml_after = to_xml(new_og) # self.assertEqual(xml_after, xml_before) @@ -176,6 +177,7 @@ def test_empty_groups_to_regionrefindexed(self): og = 
pcgts.get_Page().get_ReadingOrder().get_OrderedGroup() children = og.get_AllIndexed() self.assertTrue(isinstance(children[1], OrderedGroupIndexedType)) + self.assertTrue(isinstance(children[21], UnorderedGroupIndexedType)) # empty all the elements in the first orederdGroupIndexed children[1].set_RegionRefIndexed([]) # serialize apnd parse to see empty group converted @@ -183,6 +185,7 @@ def test_empty_groups_to_regionrefindexed(self): og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup() children = og.get_AllIndexed() self.assertTrue(isinstance(children[1], RegionRefIndexedType)) + self.assertTrue(isinstance(children[21], RegionRefIndexedType)) def test_all_regions_without_reading_order(self): """ @@ -224,5 +227,12 @@ def test_all_regions_with_reading_order(self): self.assertEqual(len(pg.get_AllRegions(classes=['Text'], order='reading-order')), 37) self.assertEqual(len(pg.get_AllRegions(classes=['Text'], order='reading-order', depth=1)), 17) + def test_get_UnorderdGroupChildren(self): + with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: + pcgts = parseString(f.read().encode('utf8'), silence=True) + ug = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup().get_UnorderedGroupIndexed()[0] + self.assertEqual(len(ug.get_UnorderedGroupChildren()), 1) + + if __name__ == '__main__': main() From ae613cfb757319e954839c74cef709359c49dcf0 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 29 May 2020 23:04:44 +0200 Subject: [PATCH 26/30] get_AllIndexed: index_sort parameter to enable/disable sorting --- ocrd_models/ocrd_models/ocrd_page_generateds.py | 16 +++++++++++----- .../ocrd_page_user_methods/get_AllIndexed.py | 7 +++++-- tests/model/test_ocrd_page.py | 10 ++++++++++ 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index ca2209571c..7ed0d71494 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ 
b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Fri May 29 22:47:44 2020 by generateDS.py version 2.35.20. +# Generated Fri May 29 23:03:43 2020 by generateDS.py version 2.35.20. # Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -5434,19 +5434,22 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec def __hash__(self): return hash(self.id) # pylint: disable=invalid-name,missing-module-docstring,line-too-long - def get_AllIndexed(self, classes=None): + def get_AllIndexed(self, classes=None, index_sort=True): """ Get all indexed children sorted by their ``@index``. Arguments: classes (list): Type of children to return. Default: ['RegionRef', 'OrderedGroup', 'UnorderedGroup'] + index_sort (boolean): Whether to sort by ``@index`` """ if not classes: classes = ['RegionRef', 'OrderedGroup', 'UnorderedGroup'] ret = [] for class_ in classes: ret += getattr(self, 'get_{}Indexed'.format(class_))() - return sorted(ret, key=lambda x: x.index) + if index_sort: + return sorted(ret, key=lambda x: x.index) + return ret def clear_AllIndexed(self): ret = self.get_AllIndexed() self.set_RegionRefIndexed([]) @@ -6250,19 +6253,22 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec def __hash__(self): return hash(self.id) # pylint: disable=invalid-name,missing-module-docstring,line-too-long - def get_AllIndexed(self, classes=None): + def get_AllIndexed(self, classes=None, index_sort=True): """ Get all indexed children sorted by their ``@index``. Arguments: classes (list): Type of children to return. 
Default: ['RegionRef', 'OrderedGroup', 'UnorderedGroup'] + index_sort (boolean): Whether to sort by ``@index`` """ if not classes: classes = ['RegionRef', 'OrderedGroup', 'UnorderedGroup'] ret = [] for class_ in classes: ret += getattr(self, 'get_{}Indexed'.format(class_))() - return sorted(ret, key=lambda x: x.index) + if index_sort: + return sorted(ret, key=lambda x: x.index) + return ret def clear_AllIndexed(self): ret = self.get_AllIndexed() self.set_RegionRefIndexed([]) diff --git a/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py b/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py index 6fa157cdfd..da660d9893 100644 --- a/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py +++ b/ocrd_models/ocrd_page_user_methods/get_AllIndexed.py @@ -1,14 +1,17 @@ # pylint: disable=invalid-name,missing-module-docstring,line-too-long -def get_AllIndexed(self, classes=None): +def get_AllIndexed(self, classes=None, index_sort=True): """ Get all indexed children sorted by their ``@index``. Arguments: classes (list): Type of children to return. 
Default: ['RegionRef', 'OrderedGroup', 'UnorderedGroup'] + index_sort (boolean): Whether to sort by ``@index`` """ if not classes: classes = ['RegionRef', 'OrderedGroup', 'UnorderedGroup'] ret = [] for class_ in classes: ret += getattr(self, 'get_{}Indexed'.format(class_))() - return sorted(ret, key=lambda x: x.index) + if index_sort: + return sorted(ret, key=lambda x: x.index) + return ret diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 9362f58d9b..7aaef47eda 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -240,5 +240,15 @@ def test_get_AllIndexed_classes(self): self.assertEqual(len(og.get_AllIndexed(classes=['OrderedGroup'])), 3) self.assertEqual(len(og.get_AllIndexed(classes=['UnorderedGroup'])), 2) + def test_get_AllIndexed_index_sort(self): + with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: + og = parseString(f.read().encode('utf8'), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup() + unogs = og.get_UnorderedGroupIndexed() + self.assertEqual([x.index for x in unogs], [20, 21]) + unogs[0].index = 21 + unogs[1].index = 20 + self.assertEqual([x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=True)], [20, 21]) + self.assertEqual([x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=False)], [21, 20]) + if __name__ == '__main__': main() From b1df95fccf079a9ee957e82dca84bd94162f2797 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 29 May 2020 23:08:27 +0200 Subject: [PATCH 27/30] add sort_AllIndexed to sort in-place --- .../ocrd_models/ocrd_page_generateds.py | 34 ++++++++++++++++++- ocrd_models/ocrd_page_user_methods.py | 1 + .../ocrd_page_user_methods/sort_AllIndexed.py | 16 +++++++++ tests/model/test_ocrd_page.py | 2 ++ 4 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 ocrd_models/ocrd_page_user_methods/sort_AllIndexed.py diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py 
b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 7ed0d71494..1e27dd5768 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Fri May 29 23:03:43 2020 by generateDS.py version 2.35.20. +# Generated Fri May 29 23:08:05 2020 by generateDS.py version 2.35.20. # Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -5473,6 +5473,22 @@ def extend_AllIndexed(self, elements): self.add_UnorderedGroupIndexed(element) return self.get_AllIndexed() + # pylint: disable=line-too-long,invalid-name,missing-module-docstring + def sort_AllIndexed(self, validate_uniqueness=True): + """ + Sort all indexed children in-place. + """ + elements = self.get_AllIndexed(index_sort=True) + self.clear_AllIndexed() + for element in elements: + if isinstance(element, RegionRefIndexedType): # pylint: disable=undefined-variable + self.add_RegionRefIndexed(element) + elif isinstance(element, OrderedGroupIndexedType): # pylint: disable=undefined-variable + self.add_OrderedGroupIndexed(element) + elif isinstance(element, UnorderedGroupIndexedType): # pylint: disable=undefined-variable + self.add_UnorderedGroupIndexed(element) + return self.get_AllIndexed() + # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments namespaceprefix_ = 'pc:' @@ -6292,6 +6308,22 @@ def extend_AllIndexed(self, elements): self.add_UnorderedGroupIndexed(element) return self.get_AllIndexed() + # pylint: disable=line-too-long,invalid-name,missing-module-docstring + def sort_AllIndexed(self, validate_uniqueness=True): + """ + Sort all indexed children in-place. 
+ """ + elements = self.get_AllIndexed(index_sort=True) + self.clear_AllIndexed() + for element in elements: + if isinstance(element, RegionRefIndexedType): # pylint: disable=undefined-variable + self.add_RegionRefIndexed(element) + elif isinstance(element, OrderedGroupIndexedType): # pylint: disable=undefined-variable + self.add_OrderedGroupIndexed(element) + elif isinstance(element, UnorderedGroupIndexedType): # pylint: disable=undefined-variable + self.add_UnorderedGroupIndexed(element) + return self.get_AllIndexed() + # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments namespaceprefix_ = 'pc:' diff --git a/ocrd_models/ocrd_page_user_methods.py b/ocrd_models/ocrd_page_user_methods.py index 59ca85b7e0..27714c408a 100644 --- a/ocrd_models/ocrd_page_user_methods.py +++ b/ocrd_models/ocrd_page_user_methods.py @@ -101,6 +101,7 @@ def _add_method(class_re, method_name): _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'get_AllIndexed'), _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'clear_AllIndexed'), _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'extend_AllIndexed'), + _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'sort_AllIndexed'), _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'exportChildren'), _add_method(r'^(UnorderedGroupType|UnorderedGroupIndexedType)$', 'get_UnorderedGroupChildren'), _add_method(r'^(PageType)$', 'get_AllRegions'), diff --git a/ocrd_models/ocrd_page_user_methods/sort_AllIndexed.py b/ocrd_models/ocrd_page_user_methods/sort_AllIndexed.py new file mode 100644 index 0000000000..eeeaf32a7c --- /dev/null +++ b/ocrd_models/ocrd_page_user_methods/sort_AllIndexed.py 
@@ -0,0 +1,16 @@ +# pylint: disable=line-too-long,invalid-name,missing-module-docstring +def sort_AllIndexed(self, validate_uniqueness=True): + """ + Sort all indexed children in-place. + """ + elements = self.get_AllIndexed(index_sort=True) + self.clear_AllIndexed() + for element in elements: + if isinstance(element, RegionRefIndexedType): # pylint: disable=undefined-variable + self.add_RegionRefIndexed(element) + elif isinstance(element, OrderedGroupIndexedType): # pylint: disable=undefined-variable + self.add_OrderedGroupIndexed(element) + elif isinstance(element, UnorderedGroupIndexedType): # pylint: disable=undefined-variable + self.add_UnorderedGroupIndexed(element) + return self.get_AllIndexed() + diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 7aaef47eda..ecd094e58c 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -249,6 +249,8 @@ def test_get_AllIndexed_index_sort(self): unogs[1].index = 20 self.assertEqual([x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=True)], [20, 21]) self.assertEqual([x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=False)], [21, 20]) + og.sort_AllIndexed() + self.assertEqual([x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=False)], [20, 21]) if __name__ == '__main__': main() From fd9dc833017c464ce962481d52a07b81d414cbec Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 29 May 2020 23:30:34 +0200 Subject: [PATCH 28/30] extend_AllIndexed: increment @index when adding elements --- .../ocrd_models/ocrd_page_generateds.py | 32 ++++++++++++++++--- .../extend_AllIndexed.py | 15 +++++++-- tests/model/test_ocrd_page.py | 21 ++++++++++++ 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_page_generateds.py b/ocrd_models/ocrd_models/ocrd_page_generateds.py index 1e27dd5768..46af7ebd28 100644 --- a/ocrd_models/ocrd_models/ocrd_page_generateds.py +++ 
b/ocrd_models/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Fri May 29 23:08:05 2020 by generateDS.py version 2.35.20. +# Generated Fri May 29 23:29:23 2020 by generateDS.py version 2.35.20. # Python 3.6.9 (default, Apr 18 2020, 01:56:04) [GCC 8.4.0] # # Command line options: @@ -5458,13 +5458,24 @@ def clear_AllIndexed(self): return ret # pylint: disable=line-too-long,invalid-name,missing-module-docstring - def extend_AllIndexed(self, elements): + def extend_AllIndexed(self, elements, validate_continuity=False): """ Add all elements in list ``elements``, respecting ``@index`` order. """ if not isinstance(elements, list): elements = [elements] - for element in sorted(elements, key=lambda x: x.index): + siblings = self.get_AllIndexed() + highest_sibling_index = siblings[-1].index if siblings else -1 + if validate_continuity: + elements = sorted(elements, key=lambda x: x.index) + lowest_element_index = elements[0].index + if lowest_element_index <= highest_sibling_index: + raise Exception("@index already used: {}".format(lowest_element_index)) + else: + for element in elements: + highest_sibling_index += 1 + element.index = highest_sibling_index + for element in elements: if isinstance(element, RegionRefIndexedType): # pylint: disable=undefined-variable self.add_RegionRefIndexed(element) elif isinstance(element, OrderedGroupIndexedType): # pylint: disable=undefined-variable @@ -6293,13 +6304,24 @@ def clear_AllIndexed(self): return ret # pylint: disable=line-too-long,invalid-name,missing-module-docstring - def extend_AllIndexed(self, elements): + def extend_AllIndexed(self, elements, validate_continuity=False): """ Add all elements in list ``elements``, respecting ``@index`` order. 
""" if not isinstance(elements, list): elements = [elements] - for element in sorted(elements, key=lambda x: x.index): + siblings = self.get_AllIndexed() + highest_sibling_index = siblings[-1].index if siblings else -1 + if validate_continuity: + elements = sorted(elements, key=lambda x: x.index) + lowest_element_index = elements[0].index + if lowest_element_index <= highest_sibling_index: + raise Exception("@index already used: {}".format(lowest_element_index)) + else: + for element in elements: + highest_sibling_index += 1 + element.index = highest_sibling_index + for element in elements: if isinstance(element, RegionRefIndexedType): # pylint: disable=undefined-variable self.add_RegionRefIndexed(element) elif isinstance(element, OrderedGroupIndexedType): # pylint: disable=undefined-variable diff --git a/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py b/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py index e364a8cc11..0eba30d1ea 100644 --- a/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py +++ b/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py @@ -1,11 +1,22 @@ # pylint: disable=line-too-long,invalid-name,missing-module-docstring -def extend_AllIndexed(self, elements): +def extend_AllIndexed(self, elements, validate_continuity=False): """ Add all elements in list ``elements``, respecting ``@index`` order. 
""" if not isinstance(elements, list): elements = [elements] - for element in sorted(elements, key=lambda x: x.index): + siblings = self.get_AllIndexed() + highest_sibling_index = siblings[-1].index if siblings else -1 + if validate_continuity: + elements = sorted(elements, key=lambda x: x.index) + lowest_element_index = elements[0].index + if lowest_element_index <= highest_sibling_index: + raise Exception("@index already used: {}".format(lowest_element_index)) + else: + for element in elements: + highest_sibling_index += 1 + element.index = highest_sibling_index + for element in elements: if isinstance(element, RegionRefIndexedType): # pylint: disable=undefined-variable self.add_RegionRefIndexed(element) elif isinstance(element, OrderedGroupIndexedType): # pylint: disable=undefined-variable diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index ecd094e58c..2f6f59031c 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -252,5 +252,26 @@ def test_get_AllIndexed_index_sort(self): og.sort_AllIndexed() self.assertEqual([x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=False)], [20, 21]) + def test_extend_AllIndexed_no_validation(self): + with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: + og = parseString(f.read().encode('utf8'), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup() + og.extend_AllIndexed([ + RegionRefIndexedType(index=3, id='r3'), + RegionRefIndexedType(index=2, id='r2'), + RegionRefIndexedType(index=1, id='r1'), + ]) + rrs = og.get_RegionRefIndexed() + self.assertEqual([x.index for x in rrs][-3:], [22, 23, 24]) + + def test_extend_AllIndexed_validate_continuity(self): + with open('tests/model/TEMP1_Gutachten2-2.xml', 'r') as f: + og = parseString(f.read().encode('utf8'), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup() + with self.assertRaisesRegex(Exception, "@index already used: 1"): + og.extend_AllIndexed([ + 
RegionRefIndexedType(index=3, id='r3'), + RegionRefIndexedType(index=2, id='r2'), + RegionRefIndexedType(index=1, id='r1'), + ], validate_continuity=True) + if __name__ == '__main__': main() From 84f1d33ce1e61da323868ba3cfbbe73fef08f7d9 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 29 May 2020 23:47:01 +0200 Subject: [PATCH 29/30] :memo: changelog --- CHANGELOG.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ae554911a..0758732177 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,22 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Added: + + * OcrdPage: `get_AllRegions`: retrieve all regions, sorted by document or reading order, #479 + * OcrdPage: `sort_AllIndexed`: sort all children by `@index` in-place + * OcrdPage: `clear_AllIndexed`: clear all `@index` children + * OcrdPage: `extend_AllIndexed`: Add elements with incrementing `@index` + * OcrdPage: Replace empty reading order groups with equivalent `RegionRef` on export + * OcrdPage: `get_UnorderedGroupChildren`: get reading order elements of an `UnorderedGroup` + + +Changed: + + * OcrdPage: `get_AllIndexed`: allow filtering by child type + * OcrdPage: `get_AllIndexed`: index_sort parameter to enable/disable sorting + + ## [2.7.1] - 2020-05-27 Fixed: From 0e146334ced3ca86f85a1ff9dff3af6ec622c291 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 3 Jun 2020 14:08:55 +0200 Subject: [PATCH 30/30] Document extend_AllIndexed validate_contiunuity param Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py b/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py index 0eba30d1ea..73645316c0 100644 --- a/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py +++ 
b/ocrd_models/ocrd_page_user_methods/extend_AllIndexed.py @@ -2,6 +2,9 @@ def extend_AllIndexed(self, elements, validate_continuity=False): """ Add all elements in list ``elements``, respecting ``@index`` order. + With ``validate_continuity``, check that all new elements come after all old elements + (or raise an exception). + Otherwise, ensure this condition silently (by increasing ``@index`` accordingly). """ if not isinstance(elements, list): elements = [elements] @@ -24,4 +27,3 @@ def extend_AllIndexed(self, elements, validate_continuity=False): elif isinstance(element, UnorderedGroupIndexedType): # pylint: disable=undefined-variable self.add_UnorderedGroupIndexed(element) return self.get_AllIndexed() -