From 1e3e7026a74f3c6446cee6a8a4a2a62d5b2cd313 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 26 Jun 2023 17:50:34 +0200 Subject: [PATCH 01/22] ocrd_mets: add get_physical_pages(for_pageIds=...) --- ocrd_models/ocrd_models/ocrd_mets.py | 33 ++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index dddf3f556..5e232efd2 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -571,15 +571,40 @@ def physical_pages(self): 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID', namespaces=NS) - def get_physical_pages(self, for_fileIds=None): + def get_physical_pages(self, for_fileIds=None, for_pageIds=None): """ List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``), - optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`. + optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`, + or for a subset selector expression (comma-separated, range, and/or regex) :py:attr:`for_pageIds`. """ - if for_fileIds is None: + if for_fileIds is None and for_pageIds is None: return self.physical_pages + if for_pageIds is not None: + ret = [] + pageId_patterns = [] + for pageId_token in re.split(r',', for_pageIds): + if pageId_token.startswith(REGEX_PREFIX): + pageId_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:])) + elif '..' in pageId_token: + pageId_patterns += generate_range(*pageId_token.split('..', 1)) + else: + pageId_patterns += [pageId_token] + if self._cache_flag: + for page_id in self._page_cache.keys(): + if page_id in pageId_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]): + ret.append(page_id) + else: + for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): + page_id = page.get('ID') + if page_id in pageId_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]): + ret.append(page_id) + return ret + ret = [None] * len(for_fileIds) - if self._cache_flag: for pageId in self._fptr_cache.keys(): for fptr in self._fptr_cache[pageId].keys(): From 07a9fe0db65f7e785a7f5504ba00be95a51d0a2d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 26 Jun 2023 18:53:37 +0200 Subject: [PATCH 02/22] ocrd workspace list-page: --page-id option --- ocrd/ocrd/cli/workspace.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index 81be6aa97..aae41903f 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -535,13 +535,20 @@ def list_groups(ctx): # ---------------------------------------------------------------------- @workspace_cli.command('list-page') +@click.option('-g', '--page-id', help="Page ID", metavar='FILTER') @pass_workspace -def list_pages(ctx): +def list_pages(ctx, page_id): """ List physical page IDs + + (If any ``FILTER`` starts with ``//``, then its remainder + will be interpreted as a regular expression.) """ workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) - print("\n".join(workspace.mets.physical_pages)) + if page_id is None: + print("\n".join(workspace.mets.physical_pages)) + else: + print("\n".join(workspace.mets.get_physical_pages(for_pageIds=page_id))) # ---------------------------------------------------------------------- # ocrd workspace get-id From 25854c554e205fdd9f80e4fd4184eccdceec5558 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Jun 2023 17:32:19 +0200 Subject: [PATCH 03/22] ocrd_mets: expose property physical_pages_labels --- ocrd_models/ocrd_models/ocrd_mets.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 5e232efd2..6f06e03b3 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -753,6 +753,18 @@ def remove_physical_page_fptr(self, fileId): mets_div.remove(mets_fptr) return ret + @property + def physical_pages_labels(self): + """ + Map all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``) to their + ``@ORDER``, ``@ORDERLABEL`` and ``@LABEL`` attributes, if any. + """ + divs = self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS) + return {div.get('ID'): (div.get('ORDER', None), div.get('ORDERLABEL', None), div.get('LABEL', None)) + for div in divs} + def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs): """ Add all files from other_mets. From ccb51ce954bb95069d3665cc19545b25abbfde0d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Jun 2023 17:33:07 +0200 Subject: [PATCH 04/22] ocrd workspace list-page: add --output-field, delegating to page labels --- ocrd/ocrd/cli/workspace.py | 58 ++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index aae41903f..26228e95c 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -370,21 +370,22 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp @workspace_cli.command('find') @mets_find_options @click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab", - default=['url'], - multiple=True, - type=click.Choice([ - 'url', - 'mimetype', - 'page_id', - 'pageId', - 'file_id', - 'ID', - 'file_grp', - 'fileGrp', - 'basename', - 'basename_without_extension', - 'local_filename', - ])) + default=['url'], + show_default=True, + multiple=True, + type=click.Choice([ + 'url', + 'mimetype', + 'page_id', + 'pageId', + 'file_id', + 'ID', + 'file_grp', + 'fileGrp', + 'basename', + 'basename_without_extension', + 'local_filename', + ])) @click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ") @click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests") @pass_workspace @@ -536,8 +537,18 @@ def list_groups(ctx): @workspace_cli.command('list-page') @click.option('-g', '--page-id', help="Page ID", metavar='FILTER') +@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab", + default=['ID'], + show_default=True, + multiple=True, + type=click.Choice([ + 'ID', + 'ORDER', + 'ORDERLABEL', + 'LABEL', + ])) @pass_workspace -def list_pages(ctx, page_id): +def list_pages(ctx, page_id, output_field): """ List physical page IDs @@ -546,9 +557,20 @@ def list_pages(ctx, page_id): """ workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) if page_id is None: - print("\n".join(workspace.mets.physical_pages)) + pages = workspace.mets.physical_pages + else: + pages = workspace.mets.get_physical_pages(for_pageIds=page_id) + if output_field == ['ID']: + print("\n".join(pages)) else: - print("\n".join(workspace.mets.get_physical_pages(for_pageIds=page_id))) + labels = workspace.mets.physical_pages_labels + def field2label(page, field): + if field == 'ID': + return page + return labels[page][['ORDER', 'ORDERLABEL', 'LABEL'].index(field)] + for page in pages: + print("\t".join(field2label(page, field) or '' + for field in output_field)) # ---------------------------------------------------------------------- # ocrd workspace get-id From e181758a9546abdd4b986ebf87fbec218551a346 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Tue, 4 Jul 2023 13:08:36 +0200 Subject: [PATCH 05/22] get phys pages returns strs or divs --- ocrd_models/ocrd_models/ocrd_mets.py | 150 +++++++++++++++------------ 1 file changed, 82 insertions(+), 68 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 6f06e03b3..c7ed8c19c 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -38,6 +38,7 @@ REGEX_PREFIX_LEN = len(REGEX_PREFIX) + class OcrdMets(OcrdXmlDocument): """ API to a single METS file @@ -66,7 +67,8 @@ def __init__(self, **kwargs): if 'OCRD_METS_CACHING' in environ: cache_override = environ['OCRD_METS_CACHING'] in ('true', '1') getLogger('ocrd_models.ocrd_mets').debug('METS Caching %s because OCRD_METS_CACHING is %s', - 'enabled' if cache_override else 'disabled', environ['OCRD_METS_CACHING']) + 'enabled' if cache_override else 'disabled', + environ['OCRD_METS_CACHING']) self._cache_flag = cache_override # If cache is enabled @@ -84,7 +86,8 @@ def __str__(self): """ String representation """ - return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % (self._cache_flag, self.file_groups, list(self.find_files())) + return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % ( + self._cache_flag, self.file_groups, list(self.find_files())) def _fill_caches(self): """ @@ -108,7 +111,7 @@ def _fill_caches(self): for el_file in el_fileGrp: file_id = el_file.get('ID') - self._file_cache[fileGrp_use].update({file_id : el_file}) + self._file_cache[fileGrp_use].update({file_id: el_file}) # log.info("File added to the cache: %s" % file_id) # Fill with pages @@ -129,7 +132,7 @@ def _fill_caches(self): # log.info("Page_id added to the cache: %s" % div_id) for el_fptr in el_div: - self._fptr_cache[div_id].update({el_fptr.get('FILEID') : el_fptr}) + self._fptr_cache[div_id].update({el_fptr.get('FILEID'): el_fptr}) # log.info("Fptr added to the cache: %s" % el_fptr.get('FILEID')) # log.info("Len of page_cache: %s" % len(self._page_cache)) @@ -143,7 +146,7 @@ def _clear_caches(self): self._file_cache = None self._page_cache = None self._fptr_cache = None - + def refresh_caches(self): if self._cache_flag: # Cache for the files (mets:file) - two nested dictionaries @@ -164,11 +167,11 @@ def refresh_caches(self): # The inner dictionary's Key: 'fptr.FILEID' # The inner dictionary's Value: a 'fptr' object at some memory location self._fptr_cache = {} - + # Note, if the empty_mets() function is used to instantiate OcrdMets # Then the cache is empty even after this operation self._fill_caches() - + @property def unique_identifier(self): """ @@ -179,7 +182,7 @@ def unique_identifier(self): found = self._tree.getroot().find('.//mods:identifier[@type="%s"]' % t, NS) if found is not None: return found.text - + @unique_identifier.setter def unique_identifier(self, purl): """ @@ -230,7 +233,7 @@ def file_groups(self): # WARNING: Actually we cannot return strings in place of elements! if self._cache_flag: - return list(self._file_cache.keys()) + return list(self._file_cache.keys()) return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)] @@ -267,25 +270,13 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None """ pageId_list = [] if pageId: - pageId_patterns = [] - for pageId_token in re.split(r',', pageId): - if pageId_token.startswith(REGEX_PREFIX): - pageId_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:])) - elif '..' in pageId_token: - pageId_patterns += generate_range(*pageId_token.split('..', 1)) + # returns divs instead of strings of ids + physical_pages = self.get_physical_pages(for_pageIds=pageId, return_divs=True) + for div in physical_pages: + if self._cache_flag: + pageId_list += self._fptr_cache[div.get('ID')] else: - pageId_patterns += [pageId_token] - if self._cache_flag: - for page_id in self._page_cache.keys(): - if page_id in pageId_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]): - pageId_list += self._fptr_cache[page_id] - else: - for page in self._tree.getroot().xpath( - '//mets:div[@TYPE="page"]', namespaces=NS): - if page.get('ID') in pageId_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(page.get('ID')) for p in pageId_patterns]): - pageId_list += [fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS)] + pageId_list += [fptr.get('FILEID') for fptr in div.findall('mets:fptr', NS)] if ID and ID.startswith(REGEX_PREFIX): ID = re.compile(ID[REGEX_PREFIX_LEN:]) @@ -295,19 +286,20 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None mimetype = re.compile(mimetype[REGEX_PREFIX_LEN:]) if url and url.startswith(REGEX_PREFIX): url = re.compile(url[REGEX_PREFIX_LEN:]) - + candidates = [] if self._cache_flag: if fileGrp: if isinstance(fileGrp, str): candidates += self._file_cache.get(fileGrp, {}).values() else: - candidates = [x for fileGrp_needle, el_file_list in self._file_cache.items() if fileGrp.match(fileGrp_needle) for x in el_file_list.values()] + candidates = [x for fileGrp_needle, el_file_list in self._file_cache.items() if + fileGrp.match(fileGrp_needle) for x in el_file_list.values()] else: candidates = [el_file for id_to_file in self._file_cache.values() for el_file in id_to_file.values()] else: candidates = self._tree.getroot().xpath('//mets:file', namespaces=NS) - + for cand in candidates: if ID: if isinstance(ID, str): @@ -364,11 +356,11 @@ def add_file_group(self, fileGrp): if el_fileGrp is None: el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP) el_fileGrp.set('USE', fileGrp) - + if self._cache_flag: # Assign an empty dictionary that will hold the files of the added fileGrp self._file_cache[fileGrp] = {} - + return el_fileGrp def rename_file_group(self, old, new): @@ -379,7 +371,7 @@ def rename_file_group(self, old, new): if el_fileGrp is None: raise FileNotFoundError("No such fileGrp '%s'" % old) el_fileGrp.set('USE', new) - + if self._cache_flag: self._file_cache[new] = self._file_cache.pop(old) @@ -406,7 +398,7 @@ def remove_file_group(self, USE, recursive=False, force=False): el_fileGrp = el_fileSec.find('mets:fileGrp[@USE="%s"]' % USE, NS) else: el_fileGrp = USE - if el_fileGrp is None: # pylint: disable=len-as-condition + if el_fileGrp is None: # pylint: disable=len-as-condition msg = "No such fileGrp: %s" % USE if force: log.warning(msg) @@ -434,7 +426,8 @@ def remove_file_group(self, USE, recursive=False, force=False): el_fileGrp.getparent().remove(el_fileGrp) - def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs): + def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, + ignore=False, **kwargs): """ Instantiate and add a new :py:class:`ocrd_models.ocrd_file.OcrdFile`. Arguments: @@ -463,16 +456,19 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force mets_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None) if mets_file: if mets_file.fileGrp == fileGrp and \ - mets_file.pageId == pageId and \ - mets_file.mimetype == mimetype: + mets_file.pageId == pageId and \ + mets_file.mimetype == mimetype: if not force: - raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} and neither force nor ignore are set") + raise FileExistsError( + f"A file with ID=={ID} already exists {mets_file} and neither force nor ignore are set") self.remove_file(ID=ID, fileGrp=fileGrp) else: - raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate") + raise FileExistsError( + f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate") # To get rid of Python's FutureWarning - checking if v is not None - kwargs = {k: v for k, v in locals().items() if k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} + kwargs = {k: v for k, v in locals().items() if + k in ['url', 'ID', 'mimetype', 'pageId', 'local_filename'] and v is not None} # This separation is needed to reuse the same el_mets_file element in the caching if block el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE) # The caching of the physical page is done in the OcrdFile constructor @@ -495,7 +491,7 @@ def remove_file(self, *args, **kwargs): if len(files) > 1: return files else: - return files[0] # for backwards-compatibility + return files[0] # for backwards-compatibility if any(1 for kwarg in kwargs if isinstance(kwarg, str) and kwarg.startswith(REGEX_PREFIX)): # allow empty results if filter criteria involve a regex @@ -566,16 +562,17 @@ def physical_pages(self): """ if self._cache_flag: return list(self._page_cache.keys()) - + return self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID', namespaces=NS) - def get_physical_pages(self, for_fileIds=None, for_pageIds=None): + def get_physical_pages(self, for_fileIds=None, for_pageIds=None, return_divs=False): """ List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``), optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`, or for a subset selector expression (comma-separated, range, and/or regex) :py:attr:`for_pageIds`. + If return_divs is set, returns div memory objects instead of strings of ids """ if for_fileIds is None and for_pageIds is None: return self.physical_pages @@ -592,16 +589,22 @@ def get_physical_pages(self, for_fileIds=None, for_pageIds=None): if self._cache_flag: for page_id in self._page_cache.keys(): if page_id in pageId_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]): - ret.append(page_id) + any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]): + if return_divs: + ret.append(self._page_cache[page_id]) + else: + ret.append(page_id) else: for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', namespaces=NS): page_id = page.get('ID') if page_id in pageId_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]): - ret.append(page_id) + any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]): + if return_divs: + ret.append(page) + else: + ret.append(page_id) return ret ret = [None] * len(for_fileIds) @@ -609,14 +612,22 @@ def get_physical_pages(self, for_fileIds=None, for_pageIds=None): for pageId in self._fptr_cache.keys(): for fptr in self._fptr_cache[pageId].keys(): if fptr in for_fileIds: - ret[for_fileIds.index(fptr)] = pageId + index = for_fileIds.index(fptr) + if return_divs: + ret[index] = self._page_cache[pageId] + else: + ret[index] = pageId else: - for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', - namespaces=NS): - for fptr in page.findall('mets:fptr', NS): - if fptr.get('FILEID') in for_fileIds: - ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID') + for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): + for fptr in page.findall('mets:fptr', NS): + if fptr.get('FILEID') in for_fileIds: + index = for_fileIds.index(fptr.get('FILEID')) + if return_divs: + ret[index] = page + else: + ret[index] = page.get('ID') return ret def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=None): @@ -657,14 +668,14 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N if el_seqdiv is None: el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV) el_seqdiv.set('TYPE', 'physSequence') - + el_pagediv = None if self._cache_flag: if pageId in self._page_cache: el_pagediv = self._page_cache[pageId] else: el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) - + if el_pagediv is None: el_pagediv = ET.SubElement(el_seqdiv, TAG_METS_DIV) el_pagediv.set('TYPE', 'page') @@ -679,13 +690,13 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N # Create a new entry in the fptr cache and # assign an empty dictionary to hold the fileids self._fptr_cache[pageId] = {} - + el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR) el_fptr.set('FILEID', ocrd_file.ID) if self._cache_flag: # Assign the ocrd fileID to the pageId in the cache - self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID : el_fptr}) + self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID: el_fptr}) def get_physical_page_for_file(self, ocrd_file): """ @@ -740,10 +751,11 @@ def remove_physical_page_fptr(self, fileId): if self._cache_flag: for page_id in self._fptr_cache.keys(): if fileId in self._fptr_cache[page_id].keys(): - mets_fptrs.append(self._fptr_cache[page_id][fileId]) + mets_fptrs.append(self._fptr_cache[page_id][fileId]) else: mets_fptrs = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, namespaces=NS) + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, + namespaces=NS) ret = [] for mets_fptr in mets_fptrs: mets_div = mets_fptr.getparent() @@ -765,7 +777,8 @@ def physical_pages_labels(self): return {div.get('ID'): (div.get('ORDER', None), div.get('ORDERLABEL', None), div.get('LABEL', None)) for div in divs} - def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs): + def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, + after_add_cb=None, **kwargs): """ Add all files from other_mets. Accepts the same kwargs as :py:func:`find_files` @@ -784,13 +797,14 @@ def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=No pageId_mapping = {} for f_src in other_mets.find_files(**kwargs): f_dest = self.add_file( - fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp), - mimetype=f_src.mimetype, - url=f_src.url, - ID=fileId_mapping.get(f_src.ID, f_src.ID), - pageId=pageId_mapping.get(f_src.pageId, f_src.pageId), - force=force) + fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp), + mimetype=f_src.mimetype, + url=f_src.url, + ID=fileId_mapping.get(f_src.ID, f_src.ID), + pageId=pageId_mapping.get(f_src.pageId, f_src.pageId), + force=force) # FIXME: merge metsHdr, amdSec, dmdSec as well # FIXME: merge structMap logical and structLink as well if after_add_cb: after_add_cb(f_dest) + From 073d9b04ae6e4c0275e3889f6fb320a21c179506 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 15 Jan 2024 14:57:12 +0100 Subject: [PATCH 06/22] update list-page-workspace with @ORDER --- tests/data/list-page-workspace/mets.xml | 54 ++++++++++++------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/data/list-page-workspace/mets.xml b/tests/data/list-page-workspace/mets.xml index 2786bfe07..3abb84e52 100644 --- a/tests/data/list-page-workspace/mets.xml +++ b/tests/data/list-page-workspace/mets.xml @@ -102,85 +102,85 @@ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + From e91cf50cd74fc6bc04eb4d384a0e8f5f96a775bf Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 15 Jan 2024 19:41:59 +0100 Subject: [PATCH 07/22] add typing info for caches in OcrdMets --- ocrd_models/ocrd_models/ocrd_mets.py | 51 +++++++++++++--------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 1110be830..e905766d8 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -4,8 +4,7 @@ from datetime import datetime import re import typing -from lxml import etree as ET -from copy import deepcopy +from typing import Dict, Optional from warnings import warn from ocrd_utils import ( @@ -34,7 +33,7 @@ METS_XML_EMPTY, ) -from .ocrd_xml_base import OcrdXmlDocument, ET +from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore from .ocrd_file import OcrdFile from .ocrd_agent import OcrdAgent @@ -45,6 +44,23 @@ class OcrdMets(OcrdXmlDocument): """ API to a single METS file """ + _cache_flag : bool + # Cache for the pages (mets:div) + # The dictionary's Key: 'div.ID' + # The dictionary's Value: a 'div' object at some memory location + _page_cache : Optional[Dict[str, ET.Element]] + # Cache for the files (mets:file) - two nested dictionaries + # The outer dictionary's Key: 'fileGrp.USE' + # The outer dictionary's Value: Inner dictionary + # The inner dictionary's Key: 'file.ID' + # The inner dictionary's Value: a 'file' object at some memory location + _file_cache : Optional[Dict[str, Dict[str, ET.Element]]] + # Cache for the file pointers (mets:fptr) - two nested dictionaries + # The outer dictionary's Key: 'div.ID' + # The outer dictionary's Value: Inner dictionary + # The inner dictionary's Key: 'fptr.FILEID' + # The inner dictionary's Value: a 'fptr' object at some memory location + _fptr_cache : Optional[Dict[str, Dict[str, ET.Element]]] @staticmethod def empty_mets(now=None, cache_flag=False): @@ -87,6 +103,10 @@ def _fill_caches(self): Fills the caches with fileGrps and FileIDs """ + assert self._page_cache is not None + assert self._file_cache is not None + assert self._fptr_cache is not None + tree_root = self._tree.getroot() # Fill with files @@ -131,34 +151,11 @@ def _fill_caches(self): # log.info("Len of page_cache: %s" % len(self._page_cache)) # log.info("Len of fptr_cache: %s" % len(self._fptr_cache)) - def _clear_caches(self): - """ - Deallocates the caches - """ - - self._file_cache = None - self._page_cache = None - self._fptr_cache = None - def refresh_caches(self): if self._cache_flag: - # Cache for the files (mets:file) - two nested dictionaries - # The outer dictionary's Key: 'fileGrp.USE' - # The outer dictionary's Value: Inner dictionary - # The inner dictionary's Key: 'file.ID' - # The inner dictionary's Value: a 'file' object at some memory location - self._file_cache = {} - # Cache for the pages (mets:div) - # The dictionary's Key: 'div.ID' - # The dictionary's Value: a 'div' object at some memory location + self._file_cache = {} self._page_cache = {} - - # Cache for the file pointers (mets:fptr) - two nested dictionaries - # The outer dictionary's Key: 'div.ID' - # The outer dictionary's Value: Inner dictionary - # The inner dictionary's Key: 'fptr.FILEID' - # The inner dictionary's Value: a 'fptr' object at some memory location self._fptr_cache = {} # Note, if the empty_mets() function is used to instantiate OcrdMets From c642d0401e6652dc0528fef1dc8cbe0da51191c8 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 15 Jan 2024 20:05:53 +0100 Subject: [PATCH 08/22] more complete test workspace for page labelling/partitioning --- tests/cli/test_workspace.py | 2 + tests/data/list-page-workspace/mets.xml | 164 ++++++++++++++++++++---- tests/model/test_ocrd_mets.py | 4 - 3 files changed, 139 insertions(+), 31 deletions(-) diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index c96d86b37..e26a6c2fb 100644 --- a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -568,6 +568,8 @@ def _call(args): assert _call(['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2']) == 'PHYS_0008,PHYS_0009,PHYS_0010' from json import loads assert loads(_call(['-f', 'json', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2'])) == [[['PHYS_0008'], ['PHYS_0009'], ['PHYS_0010']]] + assert loads(_call(['-f', 'json', '-r', 'PHYS_0001..PHYS_0010', '-k', 'ID', '-k', 'ORDERLABEL', '-D', '3', '-C', '2'])) == \ + [[['PHYS_0008', 'page 7'], ['PHYS_0009', 'page 8'], ['PHYS_0010', 'page 9']]] if __name__ == '__main__': main(__file__) diff --git a/tests/data/list-page-workspace/mets.xml b/tests/data/list-page-workspace/mets.xml index 3abb84e52..5f5d0e306 100644 --- a/tests/data/list-page-workspace/mets.xml +++ b/tests/data/list-page-workspace/mets.xml @@ -99,89 +99,199 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + - + + diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 64ea1eccf..d3ef9e282 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -6,7 +6,6 @@ from os import environ from contextlib import contextmanager import shutil -from logging import StreamHandler import lxml from tests.base import ( @@ -16,9 +15,6 @@ ) from ocrd_utils import ( - initLogging, - disableLogging, - getLogger, VERSION, MIMETYPE_PAGE ) From 9dea95f6c21ac1902c075dcc4f19c705fcd8299e Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 15 Jan 2024 20:47:14 +0100 Subject: [PATCH 09/22] replace update-page with a cleaner solution based on get_physical_pages --- ocrd/ocrd/cli/workspace.py | 35 +++++++++++++++++----------- ocrd_models/ocrd_models/constants.py | 9 +++++++ ocrd_models/ocrd_models/ocrd_mets.py | 29 ++++++++++++----------- tests/model/test_ocrd_mets.py | 2 +- 4 files changed, 47 insertions(+), 28 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index 0c8d59037..b3e199c5d 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -23,6 +23,7 @@ from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list, DEFAULT_METS_BASENAME from ocrd.decorators import mets_find_options from . import command_with_replaced_help +from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTES class WorkspaceCtx(): @@ -601,12 +602,7 @@ def list_groups(ctx): default=['ID'], show_default=True, multiple=True, - type=click.Choice([ - 'ID', - 'ORDER', - 'ORDERLABEL', - 'LABEL', - ])) + type=click.Choice(METS_PAGE_DIV_ATTRIBUTES)) @click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']), default='one-per-line') @click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks", default=1, type=int) @click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int) @@ -692,18 +688,31 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin workspace.save_mets() @workspace_cli.command('update-page') -@click.option('--order', help="@ORDER attribute for this mets:div", metavar='ORDER') -@click.option('--orderlabel', help="@ORDERLABEL attribute for this mets:div", metavar='ORDERLABEL') -@click.option('--contentids', help="@CONTENTIDS attribute for this mets:div", metavar='ORDERLABEL') +@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTES}", metavar="ATTR VALUE", nargs=2, multiple=True) +@click.option('--order', help="[DEPRECATED - use --set ATTR VALUE", metavar='ORDER') +@click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL') +@click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL') @click.argument('PAGE_ID') @pass_workspace -def update_page(ctx, order, orderlabel, contentids, page_id): +def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): """ - Update the @ORDER, @ORDERLABEL o @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID + Update the @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID """ workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) - workspace.mets.update_physical_page_attributes(page_id, order=order, orderlabel=orderlabel, contentids=contentids) - workspace.save_mets() + update_kwargs = {k: v for k, v in attr_value_pairs} + if order: + update_kwargs['ORDER'] = order + if orderlabel: + update_kwargs['ORDERLABEL'] = orderlabel + if contentids: + update_kwargs['CONTENTIDS'] = contentids + print(update_kwargs) + try: + workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) + workspace.save_mets() + except Exception as err: + print(f"Error: {err}") + sys.exit(1) # ---------------------------------------------------------------------- # ocrd workspace merge diff --git a/ocrd_models/ocrd_models/constants.py b/ocrd_models/ocrd_models/constants.py index 6c8b0e101..11d72d213 100644 --- a/ocrd_models/ocrd_models/constants.py +++ b/ocrd_models/ocrd_models/constants.py @@ -71,3 +71,12 @@ 'LineDrawing', 'Map', 'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Text', 'Unknown' ] + +METS_PAGE_DIV_ATTRIBUTES = [ + 'ID', + 'ORDER', + 'ORDERLABEL', + 'LABEL', + 'CONTENTIDS' +] + diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index e905766d8..794ecc65a 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -31,6 +31,7 @@ IDENTIFIER_PRIORITY, TAG_MODS_IDENTIFIER, METS_XML_EMPTY, + METS_PAGE_DIV_ATTRIBUTES ) from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore @@ -720,20 +721,20 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID: el_fptr}) def update_physical_page_attributes(self, page_id, **kwargs): - mets_div = None - if self._cache_flag: - if page_id in self._page_cache.keys(): - mets_div = [self._page_cache[page_id]] - else: - mets_div = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % page_id, - namespaces=NS) - if mets_div: - for attr_name, attr_value in kwargs.items(): - if attr_value: - mets_div[0].set(attr_name.upper(), attr_value) - else: - warn("Could not find mets:div[@ID={page_id}]") + invalid_keys = list(k for k in kwargs.keys() if k not in METS_PAGE_DIV_ATTRIBUTES) + if invalid_keys: + raise ValueError(f"Invalid attribute {invalid_keys}. Allowed values: {METS_PAGE_DIV_ATTRIBUTES}") + + page_div = self.get_physical_pages(for_pageIds=page_id, return_divs=True) + if not page_div: + raise ValueError(f"Could not find mets:div[@ID=={page_id}]") + page_div = page_div[0] + + for k, v in kwargs.items(): + if not v: + page_div.attrib.pop(k) + else: + page_div.attrib[k] = v def get_physical_page_for_file(self, ocrd_file): """ diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index d3ef9e282..f31a74b3d 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -388,7 +388,7 @@ def test_update_physical_page_attributes(sbb_directory_ocrd_mets): assert len(m.physical_pages) == 1 assert b'ORDER' not in m.to_xml() assert b'ORDERLABEL' not in m.to_xml() - m.update_physical_page_attributes('new page', order='foo', orderlabel='bar') + m.update_physical_page_attributes('new page', ORDER='foo', ORDERLABEL='bar') assert b'ORDER' in m.to_xml() assert b'ORDERLABEL' in m.to_xml() From cfd1c9158688be90526b57355bbd3647e1d834c6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 16 Jan 2024 14:53:14 +0100 Subject: [PATCH 10/22] OcrdMets: extend the _page_cache to include all METS_PAGE_DIV_ATTRIBUTEs --- ocrd/ocrd/cli/workspace.py | 7 ++- ocrd_models/ocrd_models/constants.py | 19 ++++---- ocrd_models/ocrd_models/ocrd_mets.py | 69 +++++++++++++++------------- 3 files changed, 51 insertions(+), 44 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index b3e199c5d..f548e8e25 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -23,7 +23,7 @@ from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list, DEFAULT_METS_BASENAME from ocrd.decorators import mets_find_options from . import command_with_replaced_help -from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTES +from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE class WorkspaceCtx(): @@ -602,7 +602,7 @@ def list_groups(ctx): default=['ID'], show_default=True, multiple=True, - type=click.Choice(METS_PAGE_DIV_ATTRIBUTES)) + type=click.Choice(METS_PAGE_DIV_ATTRIBUTE.names())) @click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']), default='one-per-line') @click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks", default=1, type=int) @click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int) @@ -688,7 +688,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin workspace.save_mets() @workspace_cli.command('update-page') -@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTES}", metavar="ATTR VALUE", nargs=2, multiple=True) +@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True) @click.option('--order', help="[DEPRECATED - use --set ATTR VALUE", metavar='ORDER') @click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL') @click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL') @@ -706,7 +706,6 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): update_kwargs['ORDERLABEL'] = orderlabel if contentids: update_kwargs['CONTENTIDS'] = contentids - print(update_kwargs) try: workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() diff --git a/ocrd_models/ocrd_models/constants.py b/ocrd_models/ocrd_models/constants.py index 11d72d213..7bba43267 100644 --- a/ocrd_models/ocrd_models/constants.py +++ b/ocrd_models/ocrd_models/constants.py @@ -2,7 +2,7 @@ Constants for ocrd_models. """ from pkg_resources import resource_string -import re +from enum import Enum, auto __all__ = [ 'IDENTIFIER_PRIORITY', @@ -27,6 +27,7 @@ 'TAG_PAGE_TEXTLINE', 'TAG_PAGE_TEXTEQUIV', 'TAG_PAGE_TEXTREGION', + 'METS_PAGE_DIV_ATTRIBUTE', ] @@ -72,11 +73,13 @@ 'Separator', 'Table', 'Text', 'Unknown' ] -METS_PAGE_DIV_ATTRIBUTES = [ - 'ID', - 'ORDER', - 'ORDERLABEL', - 'LABEL', - 'CONTENTIDS' -] +class METS_PAGE_DIV_ATTRIBUTE(Enum): + ID = auto() + ORDER = auto() + ORDERLABEL = auto() + LABEL = auto() + CONTENTIDS = auto() + @classmethod + def names(cls): + return [x.name for x in cls] diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 794ecc65a..f2017df10 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -31,7 +31,7 @@ IDENTIFIER_PRIORITY, TAG_MODS_IDENTIFIER, METS_XML_EMPTY, - METS_PAGE_DIV_ATTRIBUTES + METS_PAGE_DIV_ATTRIBUTE ) from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore @@ -40,7 +40,6 @@ REGEX_PREFIX_LEN = len(REGEX_PREFIX) - class OcrdMets(OcrdXmlDocument): """ API to a single METS file @@ -49,19 +48,19 @@ class OcrdMets(OcrdXmlDocument): # Cache for the pages (mets:div) # The dictionary's Key: 'div.ID' # The dictionary's Value: a 'div' object at some memory location - _page_cache : Optional[Dict[str, ET.Element]] + _page_cache : Dict[METS_PAGE_DIV_ATTRIBUTE, Dict[str, ET.Element]] # Cache for the files (mets:file) - two nested dictionaries # The outer dictionary's Key: 'fileGrp.USE' # The outer dictionary's Value: Inner dictionary # The inner dictionary's Key: 'file.ID' # The inner dictionary's Value: a 'file' object at some memory location - _file_cache : Optional[Dict[str, Dict[str, ET.Element]]] + _file_cache : Dict[str, Dict[str, ET.Element]] # Cache for the file pointers (mets:fptr) - two nested dictionaries # The outer dictionary's Key: 'div.ID' # The outer dictionary's Value: Inner dictionary # The inner dictionary's Key: 'fptr.FILEID' # The inner dictionary's Value: a 'fptr' object at some memory location - _fptr_cache : Optional[Dict[str, Dict[str, ET.Element]]] + _fptr_cache : Dict[str, Dict[str, ET.Element]] @staticmethod def empty_mets(now=None, cache_flag=False): @@ -88,9 +87,11 @@ def __init__(self, **kwargs): 'enabled' if config.OCRD_METS_CACHING else 'disabled', config.raw_value('OCRD_METS_CACHING')) self._cache_flag = config.OCRD_METS_CACHING + # If cache is enabled if self._cache_flag: - self.refresh_caches() + self._initialize_caches() + self._refresh_caches() def __str__(self): """ @@ -104,10 +105,6 @@ def _fill_caches(self): Fills the caches with fileGrps and FileIDs """ - assert self._page_cache is not None - assert self._file_cache is not None - assert self._fptr_cache is not None - tree_root = self._tree.getroot() # Fill with files @@ -138,7 +135,8 @@ def _fill_caches(self): div_id = el_div.get('ID') log.debug("DIV_ID: %s" % el_div.get('ID')) - self._page_cache[div_id] = el_div + for attr in METS_PAGE_DIV_ATTRIBUTE: + self._page_cache[attr][el_div.get(attr.name)] = el_div # Assign an empty dictionary that will hold the fptr of the added page (div) self._fptr_cache[div_id] = {} @@ -149,15 +147,18 @@ def _fill_caches(self): self._fptr_cache[div_id].update({el_fptr.get('FILEID'): el_fptr}) # log.info("Fptr added to the cache: %s" % el_fptr.get('FILEID')) - # log.info("Len of page_cache: %s" % len(self._page_cache)) + # log.info("Len of page_cache: %s" % len(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID])) # log.info("Len of fptr_cache: %s" % len(self._fptr_cache)) - def refresh_caches(self): - if self._cache_flag: + def _initialize_caches(self): + self._file_cache = {} + # NOTE we can only guarantee uniqueness for @ID and @ORDER + self._page_cache = {k : {} for k in METS_PAGE_DIV_ATTRIBUTE} + self._fptr_cache = {} - self._file_cache = {} - self._page_cache = {} - self._fptr_cache = {} + def _refresh_caches(self): + if self._cache_flag: + self._initialize_caches() # Note, if the empty_mets() function is used to instantiate OcrdMets # Then the cache is empty even after this operation @@ -563,8 +564,9 @@ def remove_one_file(self, ID, fileGrp=None): page_div.getparent().remove(page_div) # Delete the empty pages from caches as well if self._cache_flag: - del self._page_cache[page_div.get('ID')] - del self._fptr_cache[page_div.get('ID')] + for attr in METS_PAGE_DIV_ATTRIBUTE: + if attr.name in page_div.attrib: + del self._page_cache[attr][page_div.attrib[attr.name]] # Delete the file reference from the cache if self._cache_flag: @@ -583,13 +585,13 @@ def physical_pages(self): List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``) """ if self._cache_flag: - return list(self._page_cache.keys()) + return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys()) return [str(x) for x in self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID', namespaces=NS)] - def get_physical_pages(self, for_fileIds=None, for_pageIds=None, return_divs=False): + def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : Optional[str] = None, return_divs : bool = False): """ List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``), optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`, @@ -609,11 +611,11 @@ def get_physical_pages(self, for_fileIds=None, for_pageIds=None, return_divs=Fal else: pageId_patterns += [pageId_token] if self._cache_flag: - for page_id in self._page_cache.keys(): + for page_id in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys(): if page_id in pageId_patterns or \ any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]): if return_divs: - ret.append(self._page_cache[page_id]) + ret.append(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][page_id]) else: ret.append(page_id) else: @@ -636,7 +638,7 @@ def get_physical_pages(self, for_fileIds=None, for_pageIds=None, return_divs=Fal if fptr in for_fileIds: index = for_fileIds.index(fptr) if return_divs: - ret[index] = self._page_cache[pageId] + ret[index] = self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId] else: ret[index] = pageId else: @@ -694,7 +696,7 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N el_pagediv = None if self._cache_flag: if pageId in self._page_cache: - el_pagediv = self._page_cache[pageId] + el_pagediv = self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId] else: el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS) @@ -708,7 +710,7 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N el_pagediv.set('ORDERLABEL', orderlabel) if self._cache_flag: # Create a new entry in the page cache - self._page_cache[pageId] = el_pagediv + self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId] = el_pagediv # Create a new entry in the fptr cache and # assign an empty dictionary to hold the fileids self._fptr_cache[pageId] = {} @@ -721,9 +723,9 @@ def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=N self._fptr_cache[el_pagediv.get('ID')].update({ocrd_file.ID: el_fptr}) def update_physical_page_attributes(self, page_id, **kwargs): - invalid_keys = list(k for k in kwargs.keys() if k not in METS_PAGE_DIV_ATTRIBUTES) + invalid_keys = list(k for k in kwargs.keys() if k not in METS_PAGE_DIV_ATTRIBUTE.names()) if invalid_keys: - raise ValueError(f"Invalid attribute {invalid_keys}. Allowed values: {METS_PAGE_DIV_ATTRIBUTES}") + raise ValueError(f"Invalid attribute {invalid_keys}. Allowed values: {METS_PAGE_DIV_ATTRIBUTE.names()}") page_div = self.get_physical_pages(for_pageIds=page_id, return_divs=True) if not page_div: @@ -745,7 +747,7 @@ def get_physical_page_for_file(self, ocrd_file): if self._cache_flag: for pageId in self._fptr_cache.keys(): if ocrd_file.ID in self._fptr_cache[pageId].keys(): - ret.append(self._page_cache[pageId].get('ID')) + ret.append(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId].get('ID')) else: ret = self._tree.getroot().xpath( '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' % @@ -761,16 +763,19 @@ def remove_physical_page(self, ID): """ mets_div = None if self._cache_flag: - if ID in self._page_cache.keys(): - mets_div = [self._page_cache[ID]] + if ID in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys(): + mets_div = [self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][ID]] else: mets_div = self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID, namespaces=NS) if mets_div: + mets_div_attrib = {** mets_div[0].attrib} mets_div[0].getparent().remove(mets_div[0]) if self._cache_flag: - del self._page_cache[ID] + for attr in METS_PAGE_DIV_ATTRIBUTE: + if attr.name in mets_div_attrib: + del self._page_cache[attr][mets_div_attrib[attr.name]] del self._fptr_cache[ID] def remove_physical_page_fptr(self, fileId): From ee8fb6996ddd1976a418c30fe426eed94ca52623 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 16 Jan 2024 17:16:09 +0100 Subject: [PATCH 11/22] implement generic page attribute ranges --- ocrd/ocrd/cli/workspace.py | 4 +- ocrd_models/ocrd_models/ocrd_mets.py | 70 ++++++++++++++++++---------- ocrd_utils/pyproject.toml | 7 +-- tests/model/test_ocrd_mets.py | 2 + 4 files changed, 52 insertions(+), 31 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index f548e8e25..7480f5962 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -696,9 +696,8 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin @pass_workspace def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): """ - Update the @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID + Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) update_kwargs = {k: v for k, v in attr_value_pairs} if order: update_kwargs['ORDER'] = order @@ -707,6 +706,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): if contentids: update_kwargs['CONTENTIDS'] = contentids try: + workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() except Exception as err: diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index f2017df10..ab50694d5 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -136,7 +136,7 @@ def _fill_caches(self): log.debug("DIV_ID: %s" % el_div.get('ID')) for attr in METS_PAGE_DIV_ATTRIBUTE: - self._page_cache[attr][el_div.get(attr.name)] = el_div + self._page_cache[attr][str(el_div.get(attr.name))] = el_div # Assign an empty dictionary that will hold the fptr of the added page (div) self._fptr_cache[div_id] = {} @@ -600,37 +600,59 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O """ if for_fileIds is None and for_pageIds is None: return self.physical_pages + log = getLogger('ocrd.models.ocrd_mets.get_physical_pages') if for_pageIds is not None: ret = [] - pageId_patterns = [] + page_attr_patterns = [] for pageId_token in re.split(r',', for_pageIds): if pageId_token.startswith(REGEX_PREFIX): - pageId_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:])) + page_attr_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:])) elif '..' in pageId_token: - pageId_patterns += generate_range(*pageId_token.split('..', 1)) + page_attr_patterns += generate_range(*pageId_token.split('..', 1)) else: - pageId_patterns += [pageId_token] - if self._cache_flag: - for page_id in self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].keys(): - if page_id in pageId_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]): - if return_divs: - ret.append(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][page_id]) - else: - ret.append(page_id) - else: - for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', - namespaces=NS): - page_id = page.get('ID') - if page_id in pageId_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]): - if return_divs: - ret.append(page) - else: - ret.append(page_id) + page_attr_patterns += [pageId_token] + if page_attr_patterns: + if self._cache_flag: + # determine attr to look for before iterating + try: + attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if ( + any(p in self._page_cache[a] for p in page_attr_patterns) or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) \ + for p in page_attr_patterns \ + for attr_val in self._page_cache[a]] + ))) + for attr_val in self._page_cache[attr].keys(): + if attr_val in page_attr_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]): + if return_divs: + ret.append(self._page_cache[attr][attr_val]) + else: + ret.append(attr_val) + except StopIteration: + log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}") + else: + # determine attr during iterating + attr = None + for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): + try: + if not attr: + attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if \ + page.get(a.name) in page_attr_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(page.get(a.name)) for p in page_attr_patterns])) + attr_val = page.get(attr.name) + if attr_val in page_attr_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]): + if return_divs: + ret.append(page) + else: + ret.append(attr_val) + except StopIteration: + log.debug(f"No pattern matches any mets:div attributes. patterns: {page_attr_patterns}") return ret + assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright ret = [None] * len(for_fileIds) if self._cache_flag: for pageId in self._fptr_cache.keys(): diff --git a/ocrd_utils/pyproject.toml b/ocrd_utils/pyproject.toml index dec419a9b..b34a8732b 100644 --- a/ocrd_utils/pyproject.toml +++ b/ocrd_utils/pyproject.toml @@ -1,18 +1,18 @@ [build-system] requires = [ "setuptools>=61", - "setuptools_scm[toml]", "wheel", ] # PEP 508 specifications. build-backend = "setuptools.build_meta" [project] name = "ocrd_utils" +version = "1.2.3" authors = [{name = "Konstantin Baierer", email = "unixprog@gmail.com"}] license = {text = "Apache License 2.0"} description = "OCR-D framework - shared code, helpers, constants" requires-python = ">=3.7" -dynamic = ["version", "dependencies"] +dynamic = ["dependencies"] [project.readme] file = "README.md" @@ -35,6 +35,3 @@ include-package-data = true [tool.setuptools.packages.find] namespaces = false - -[tool.setuptools_scm] -root = ".." diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index f31a74b3d..15636e67f 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -90,6 +90,8 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' + assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' + assert len(sbb_sample_01.find_all_files(pageId='0..100')) == 35, '35 files in @ORDER range 1..10' def test_find_all_files_local_only(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', From 1427c07b8d49904d00caf1766861c54f89c66d27 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 17 Jan 2024 13:02:17 +0100 Subject: [PATCH 12/22] utils.generate_range: raise a ValueError if non-numeric parts differ --- ocrd_utils/ocrd_utils/str.py | 2 ++ tests/cli/test_workspace.py | 1 + tests/model/test_ocrd_mets.py | 3 ++- tests/test_utils.py | 10 ++++++++-- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py index f5b9242d3..cf5cd3778 100644 --- a/ocrd_utils/ocrd_utils/str.py +++ b/ocrd_utils/ocrd_utils/str.py @@ -204,6 +204,8 @@ def generate_range(start, end): start_num, end_num = re.findall(r'\d+', start)[-1], re.findall(r'\d+', end)[-1] except IndexError: raise ValueError("Range '%s..%s': could not find numeric part" % (start, end)) + if start[:-len(start_num)] != end[:-len(end_num)]: + raise ValueError(f"Range '{start}..{end}' differ in their non-numeric part: '{start[:-len(start_num)]}' != '{end[:-len(end_num)]}'") if start_num == end_num: warn("Range '%s..%s': evaluates to the same number") for i in range(int(start_num), int(end_num) + 1): diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index e26a6c2fb..b155811ac 100644 --- a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -563,6 +563,7 @@ def _call(args): assert _call(['-f', 'json']) == '[[["PHYS_0001"], ["PHYS_0002"], ["PHYS_0003"], ["PHYS_0004"], ["PHYS_0005"], ["PHYS_0006"], ["PHYS_0008"], ["PHYS_0009"], ["PHYS_0010"], ["PHYS_0011"], ["PHYS_0012"], ["PHYS_0013"], ["PHYS_0014"], ["PHYS_0015"], ["PHYS_0016"], ["PHYS_0017"], ["PHYS_0018"], ["PHYS_0019"], ["PHYS_0020"], ["PHYS_0022"], ["PHYS_0023"], ["PHYS_0024"], ["PHYS_0025"], ["PHYS_0026"], ["PHYS_0027"], ["PHYS_0028"], ["PHYS_0029"]]]' assert _call(['-f', 'comma-separated', '-R', '5..5']) == 'PHYS_0005' assert _call(['-f', 'comma-separated', '-R', '6..8']) == 'PHYS_0006,PHYS_0008,PHYS_0009' + assert _call(['-f', 'comma-separated', '-r', '1..5']) == 'PHYS_0001,PHYS_0002,PHYS_0003,PHYS_0004,PHYS_0005' assert _call(['-f', 'comma-separated', '-r', 'PHYS_0006..PHYS_0009']) == 'PHYS_0006,PHYS_0008,PHYS_0009' assert _call(['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3']) == 'PHYS_0001,PHYS_0002,PHYS_0003\nPHYS_0004,PHYS_0005,PHYS_0006\nPHYS_0008,PHYS_0009,PHYS_0010' assert _call(['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2']) == 'PHYS_0008,PHYS_0009,PHYS_0010' diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 15636e67f..631f61d0d 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -91,7 +91,8 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' - assert len(sbb_sample_01.find_all_files(pageId='0..100')) == 35, '35 files in @ORDER range 1..10' + assert len(sbb_sample_01.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10' + assert len(sbb_sample_01.find_all_files(pageId='1..PHYS_0002')) == 35, '35 files in @ORDER range 1..10' def test_find_all_files_local_only(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', diff --git a/tests/test_utils.py b/tests/test_utils.py index d2093c465..2dac359c4 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -295,9 +295,15 @@ def test_make_file_id_744(): def test_generate_range(): assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005'] with raises(ValueError, match='could not find numeric part'): - generate_range('NONUMBER', 'ALSO_NONUMBER') + assert generate_range('NONUMBER', 'ALSO_NONUMBER') + with raises(ValueError, match='differ in their non-numeric part'): + generate_range('PHYS_0001_123', 'PHYS_0010_123') + with raises(ValueError, match='differ in their non-numeric part'): + assert generate_range('1', 'PHYS_0005') == 0 + with raises(ValueError, match='differ in their non-numeric part'): + assert generate_range('1', 'page 5') == 0 with warns(UserWarning, match='same number'): - generate_range('PHYS_0001_123', 'PHYS_0010_123') == 'PHYS_0001_123' + assert generate_range('PHYS_0001_123', 'PHYS_0001_123') == ['PHYS_0001_123'] def test_safe_filename(): assert safe_filename('Hello world,!') == 'Hello_world_' From c36360dc7dd47776cd170d954c0a183623e7de39 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 17 Jan 2024 13:26:35 +0100 Subject: [PATCH 13/22] fix tests --- tests/cli/test_workspace.py | 2 ++ tests/model/test_ocrd_mets.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index b155811ac..4632fe1f5 100644 --- a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -564,6 +564,8 @@ def _call(args): assert _call(['-f', 'comma-separated', '-R', '5..5']) == 'PHYS_0005' assert _call(['-f', 'comma-separated', '-R', '6..8']) == 'PHYS_0006,PHYS_0008,PHYS_0009' assert _call(['-f', 'comma-separated', '-r', '1..5']) == 'PHYS_0001,PHYS_0002,PHYS_0003,PHYS_0004,PHYS_0005' + assert _call(['-f', 'comma-separated', '-r', '2..3']) == 'PHYS_0002,PHYS_0003' + assert _call(['-f', 'comma-separated', '-r', 'page 2..page 3']) == 'PHYS_0002,PHYS_0003' assert _call(['-f', 'comma-separated', '-r', 'PHYS_0006..PHYS_0009']) == 'PHYS_0006,PHYS_0008,PHYS_0009' assert _call(['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3']) == 'PHYS_0001,PHYS_0002,PHYS_0003\nPHYS_0004,PHYS_0005,PHYS_0006\nPHYS_0008,PHYS_0009,PHYS_0010' assert _call(['-f', 'comma-separated', '-r', 'PHYS_0001..PHYS_0010', '-D', '3', '-C', '2']) == 'PHYS_0008,PHYS_0009,PHYS_0010' diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 631f61d0d..f33cf2396 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -92,7 +92,8 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10' - assert len(sbb_sample_01.find_all_files(pageId='1..PHYS_0002')) == 35, '35 files in @ORDER range 1..10' + with raises(ValueError, match='differ in their non-numeric part'): + len(sbb_sample_01.find_all_files(pageId='1..PHYS_0002')) def test_find_all_files_local_only(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', From 3a60c1ff58a112574da345f412159597ea5d8551 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 17 Jan 2024 13:43:54 +0100 Subject: [PATCH 14/22] revert accidental commit to ocrd_utils/pyproject.toml --- ocrd_utils/pyproject.toml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ocrd_utils/pyproject.toml b/ocrd_utils/pyproject.toml index b34a8732b..dec419a9b 100644 --- a/ocrd_utils/pyproject.toml +++ b/ocrd_utils/pyproject.toml @@ -1,18 +1,18 @@ [build-system] requires = [ "setuptools>=61", + "setuptools_scm[toml]", "wheel", ] # PEP 508 specifications. build-backend = "setuptools.build_meta" [project] name = "ocrd_utils" -version = "1.2.3" authors = [{name = "Konstantin Baierer", email = "unixprog@gmail.com"}] license = {text = "Apache License 2.0"} description = "OCR-D framework - shared code, helpers, constants" requires-python = ">=3.7" -dynamic = ["dependencies"] +dynamic = ["version", "dependencies"] [project.readme] file = "README.md" @@ -35,3 +35,6 @@ include-package-data = true [tool.setuptools.packages.find] namespaces = false + +[tool.setuptools_scm] +root = ".." From 517814bf654770695dba617a35de2adf55e98847 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 30 Jan 2024 19:38:25 +0100 Subject: [PATCH 15/22] get_physical_pages: return early if no patterns --- src/ocrd_models/ocrd_mets.py | 75 ++++++++++++++++++----------------- tests/model/test_ocrd_mets.py | 2 +- 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 95c29ade8..6be47ad16 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -611,45 +611,46 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O page_attr_patterns += generate_range(*pageId_token.split('..', 1)) else: page_attr_patterns += [pageId_token] - if page_attr_patterns: - if self._cache_flag: - # determine attr to look for before iterating + if not page_attr_patterns: + return [] + if self._cache_flag: + # determine attr to look for before iterating + try: + attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if ( + any(p in self._page_cache[a] for p in page_attr_patterns) or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) \ + for p in page_attr_patterns \ + for attr_val in self._page_cache[a]] + ))) + for attr_val in self._page_cache[attr].keys(): + if attr_val in page_attr_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]): + if return_divs: + ret.append(self._page_cache[attr][attr_val]) + else: + ret.append(attr_val) + except StopIteration: + log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}") + else: + # determine attr during iterating + attr = None + for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): try: - attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if ( - any(p in self._page_cache[a] for p in page_attr_patterns) or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) \ - for p in page_attr_patterns \ - for attr_val in self._page_cache[a]] - ))) - for attr_val in self._page_cache[attr].keys(): - if attr_val in page_attr_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]): - if return_divs: - ret.append(self._page_cache[attr][attr_val]) - else: - ret.append(attr_val) + if not attr: + attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if \ + page.get(a.name) in page_attr_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(page.get(a.name)) for p in page_attr_patterns])) + attr_val = page.get(attr.name) + if attr_val in page_attr_patterns or \ + any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]): + if return_divs: + ret.append(page) + else: + ret.append(attr_val) except StopIteration: - log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}") - else: - # determine attr during iterating - attr = None - for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', - namespaces=NS): - try: - if not attr: - attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if \ - page.get(a.name) in page_attr_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(page.get(a.name)) for p in page_attr_patterns])) - attr_val = page.get(attr.name) - if attr_val in page_attr_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]): - if return_divs: - ret.append(page) - else: - ret.append(attr_val) - except StopIteration: - log.debug(f"No pattern matches any mets:div attributes. patterns: {page_attr_patterns}") + log.debug(f"No pattern matches any mets:div attributes. patterns: {page_attr_patterns}") return ret assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index f33cf2396..125f2ac1e 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -92,7 +92,7 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10' - with raises(ValueError, match='differ in their non-numeric part'): + with pytest.raises(ValueError, match='differ in their non-numeric part'): len(sbb_sample_01.find_all_files(pageId='1..PHYS_0002')) def test_find_all_files_local_only(sbb_sample_01): From 1225912251b40c2688e1b4414d8b53198b5f0420 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 6 Feb 2024 16:52:23 +0100 Subject: [PATCH 16/22] OcrdMets.find_all_files: fix page attr loop --- src/ocrd_models/ocrd_mets.py | 78 ++++++++++++++++++++--------------- src/ocrd_utils/str.py | 2 +- tests/model/test_ocrd_mets.py | 10 ++++- 3 files changed, 54 insertions(+), 36 deletions(-) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 6be47ad16..5838a316f 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -214,7 +214,6 @@ def add_agent(self, *args, **kwargs): el_agent_last.addnext(el_agent) except StopIteration: el_metsHdr.insert(0, el_agent) - # print(ET.tostring(el_metsHdr)) return OcrdAgent(el_agent, *args, **kwargs) @property @@ -608,49 +607,60 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O if pageId_token.startswith(REGEX_PREFIX): page_attr_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:])) elif '..' in pageId_token: - page_attr_patterns += generate_range(*pageId_token.split('..', 1)) + val_range = generate_range(*pageId_token.split('..', 1)) + if val_range: + page_attr_patterns.append(val_range) else: - page_attr_patterns += [pageId_token] + page_attr_patterns.append(pageId_token) if not page_attr_patterns: return [] if self._cache_flag: # determine attr to look for before iterating try: - attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if ( - any(p in self._page_cache[a] for p in page_attr_patterns) or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) \ - for p in page_attr_patterns \ - for attr_val in self._page_cache[a]] - ))) - for attr_val in self._page_cache[attr].keys(): - if attr_val in page_attr_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]): - if return_divs: - ret.append(self._page_cache[attr][attr_val]) - else: - ret.append(attr_val) + for pat in page_attr_patterns: + # for attr in list(METS_PAGE_DIV_ATTRIBUTE): + attr : METS_PAGE_DIV_ATTRIBUTE + if isinstance(pat, str): + attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat in self._page_cache[a]) + cache_keys = [pat] + elif isinstance(pat, list): + attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat[0] in self._page_cache[a]) + cache_keys = [v for v in pat if v in self._page_cache[attr]] + elif isinstance(pat, typing.Pattern): + attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) for v in self._page_cache[a] if pat.fullmatch(v)) + cache_keys = [v for v in self._page_cache[attr] if pat.fullmatch(v)] + else: + raise ValueError + if return_divs: + ret += [self._page_cache[attr][v] for v in cache_keys] + else: + ret += cache_keys except StopIteration: log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}") else: - # determine attr during iterating - attr = None - for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', - namespaces=NS): - try: - if not attr: - attr = next(a for a in METS_PAGE_DIV_ATTRIBUTE if \ - page.get(a.name) in page_attr_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(page.get(a.name)) for p in page_attr_patterns])) - attr_val = page.get(attr.name) - if attr_val in page_attr_patterns or \ - any([isinstance(p, typing.Pattern) and p.fullmatch(attr_val) for p in page_attr_patterns]): - if return_divs: - ret.append(page) + while page_attr_patterns: + pat = page_attr_patterns.pop(0) + for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): + try: + if isinstance(pat, str): + attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat == page.get(a.name)) + ret.append(page if return_divs else pat) + elif isinstance(pat, list): + if not isinstance(pat[0], METS_PAGE_DIV_ATTRIBUTE): + pat.insert(0, next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat[0] == page.get(a.name))) + attr_val = page.get(pat[0].name) + if attr_val in pat: + pat.remove(attr_val) + ret.append(page if return_divs else attr_val) + elif isinstance(pat, typing.Pattern): + attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat.fullmatch(page.get(a.name) or '')) + ret.append(page if return_divs else page.get(attr.name)) else: - ret.append(attr_val) - except StopIteration: - log.debug(f"No pattern matches any mets:div attributes. patterns: {page_attr_patterns}") + raise ValueError + except StopIteration: + log.debug(f"No mets:div attributes match pattern {pat}") return ret assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index e1e77ccc5..6a9986ae7 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -196,7 +196,7 @@ def safe_filename(url): # print('safe filename: %s -> %s' % (url, ret)) return ret -def generate_range(start, end): +def generate_range(start : str, end : str) -> List[str]: """ Generate a list of strings by incrementing the number part of ``start`` until including ``end``. """ diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 125f2ac1e..50a172e38 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -80,18 +80,26 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"' assert len(sbb_sample_01.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"' assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"' - assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001-NOTEXIST')) == 0, '0 pages for "PHYS_0001-NOTEXIST"' + # assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001-NOTEXIST')) == 0, '0 pages for "PHYS_0001-NOTEXIST"' assert len(sbb_sample_01.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff' assert len(sbb_sample_01.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*' assert len(sbb_sample_01.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE assert len(sbb_sample_01.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' assert len(sbb_sample_01.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"' + # assert [str(x.ID) for x in sbb_sample_01.find_all_files()] == [str(x.ID) for x in sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')] + #print([x.ID for x in sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')]) assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' + print([x.ID for x in sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')]) assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10' + assert len(sbb_sample_01.find_all_files(pageId='1..5')) == 35, '35 files in @ORDER range 1..10' + assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001,PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' + assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' + assert len(sbb_sample_01.find_all_files(pageId='page 1..page 2,5')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' + assert len(sbb_sample_01.find_all_files(pageId='PHYS_0005,1..2')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' with pytest.raises(ValueError, match='differ in their non-numeric part'): len(sbb_sample_01.find_all_files(pageId='1..PHYS_0002')) From 4a25d1e0e96aaee06366efdfba91addba1fa69a2 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 8 Feb 2024 11:08:13 +0100 Subject: [PATCH 17/22] OcrdMets.get_physical_pages should return IDs if not return_divs Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- src/ocrd_models/ocrd_mets.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 5838a316f..5bb689938 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -615,7 +615,6 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O if not page_attr_patterns: return [] if self._cache_flag: - # determine attr to look for before iterating try: for pat in page_attr_patterns: # for attr in list(METS_PAGE_DIV_ATTRIBUTE): @@ -634,7 +633,7 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O if return_divs: ret += [self._page_cache[attr][v] for v in cache_keys] else: - ret += cache_keys + ret += [self._page_cache[attr][v].get('ID') for v in cache_keys] except StopIteration: log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}") else: @@ -646,17 +645,17 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O try: if isinstance(pat, str): attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat == page.get(a.name)) - ret.append(page if return_divs else pat) + ret.append(page if return_divs else page.get('ID')) elif isinstance(pat, list): if not isinstance(pat[0], METS_PAGE_DIV_ATTRIBUTE): pat.insert(0, next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat[0] == page.get(a.name))) attr_val = page.get(pat[0].name) if attr_val in pat: pat.remove(attr_val) - ret.append(page if return_divs else attr_val) + ret.append(page if return_divs else page.get('ID')) elif isinstance(pat, typing.Pattern): attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat.fullmatch(page.get(a.name) or '')) - ret.append(page if return_divs else page.get(attr.name)) + ret.append(page if return_divs else page.get('ID')) else: raise ValueError except StopIteration: From 466c61d0e50f944a7e6b4224f44b828f17bfc873 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 8 Feb 2024 12:04:38 +0100 Subject: [PATCH 18/22] OcrdMets.get_physical_pages: Cache the attribute in the non-cached regex case --- src/ocrd_models/ocrd_mets.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 5bb689938..6416cfebb 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -605,7 +605,7 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O page_attr_patterns = [] for pageId_token in re.split(r',', for_pageIds): if pageId_token.startswith(REGEX_PREFIX): - page_attr_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:])) + page_attr_patterns.append((None, re.compile(pageId_token[REGEX_PREFIX_LEN:]))) elif '..' in pageId_token: val_range = generate_range(*pageId_token.split('..', 1)) if val_range: @@ -625,9 +625,10 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O elif isinstance(pat, list): attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat[0] in self._page_cache[a]) cache_keys = [v for v in pat if v in self._page_cache[attr]] - elif isinstance(pat, typing.Pattern): - attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) for v in self._page_cache[a] if pat.fullmatch(v)) - cache_keys = [v for v in self._page_cache[attr] if pat.fullmatch(v)] + elif isinstance(pat, tuple): + _, re_pat = pat + attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) for v in self._page_cache[a] if re_pat.fullmatch(v)) + cache_keys = [v for v in self._page_cache[attr] if re_pat.fullmatch(v)] else: raise ValueError if return_divs: @@ -653,9 +654,13 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O if attr_val in pat: pat.remove(attr_val) ret.append(page if return_divs else page.get('ID')) - elif isinstance(pat, typing.Pattern): - attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat.fullmatch(page.get(a.name) or '')) - ret.append(page if return_divs else page.get('ID')) + elif isinstance(pat, tuple): + attr, re_pat = pat + if not attr: + attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if re_pat.fullmatch(page.get(a.name) or '')) + pat = (attr, re_pat) + if re_pat.fullmatch(page.get(attr.name) or ''): + ret.append(page if return_divs else page.get('ID')) else: raise ValueError except StopIteration: From 9f8406716adc33e2ba760b9f191d84f8eeb04f07 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 8 Feb 2024 13:48:29 +0100 Subject: [PATCH 19/22] OcrdMets.get_physical_pages: raise ValueError if a pattern matches no pages --- src/ocrd_models/ocrd_mets.py | 21 ++++++++++++++------- tests/model/test_ocrd_mets.py | 9 ++++++++- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 6416cfebb..2ff3ecbf1 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -608,15 +608,14 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O page_attr_patterns.append((None, re.compile(pageId_token[REGEX_PREFIX_LEN:]))) elif '..' in pageId_token: val_range = generate_range(*pageId_token.split('..', 1)) - if val_range: - page_attr_patterns.append(val_range) + page_attr_patterns.append(val_range) else: page_attr_patterns.append(pageId_token) if not page_attr_patterns: return [] if self._cache_flag: - try: - for pat in page_attr_patterns: + for pat in page_attr_patterns: + try: # for attr in list(METS_PAGE_DIV_ATTRIBUTE): attr : METS_PAGE_DIV_ATTRIBUTE if isinstance(pat, str): @@ -635,9 +634,13 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O ret += [self._page_cache[attr][v] for v in cache_keys] else: ret += [self._page_cache[attr][v].get('ID') for v in cache_keys] - except StopIteration: - log.debug(f"No pattern matches any keys of any of the _page_caches. patterns: {page_attr_patterns}") + except StopIteration: + raise ValueError(f"{pat} matches none of the keys of any of the _page_caches.") else: + page_attr_patterns_copy = list(page_attr_patterns) + page_attr_patterns_idx = set(range(len(page_attr_patterns))) + pat_idx_matched_at_least_once = set() + pat_idx = 0 while page_attr_patterns: pat = page_attr_patterns.pop(0) for page in self._tree.getroot().xpath( @@ -664,7 +667,11 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O else: raise ValueError except StopIteration: - log.debug(f"No mets:div attributes match pattern {pat}") + continue + pat_idx_matched_at_least_once.add(pat_idx) + pat_idx += 1 + if unmatched_idx := page_attr_patterns_idx - pat_idx_matched_at_least_once: + raise ValueError(f"Patterns {[page_attr_patterns_copy[x] for x in unmatched_idx]} match none of the pages") return ret assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 50a172e38..dbd81f492 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -5,6 +5,7 @@ from os.path import join from os import environ from contextlib import contextmanager +import re import shutil import lxml @@ -90,7 +91,7 @@ def test_find_all_files(sbb_sample_01): #print([x.ID for x in sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')]) assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' - print([x.ID for x in sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')]) + # print([x.ID for x in sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')]) assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' @@ -102,6 +103,12 @@ def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0005,1..2')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' with pytest.raises(ValueError, match='differ in their non-numeric part'): len(sbb_sample_01.find_all_files(pageId='1..PHYS_0002')) + with pytest.raises(ValueError, match=re.compile(f'match(es)? none')): + sbb_sample_01.find_all_files(pageId='PHYS_0006..PHYS_0029') + with pytest.raises(ValueError, match=re.compile(f'match(es)? none')): + sbb_sample_01.find_all_files(pageId='1..5,PHYS_0006..PHYS_0029') + with pytest.raises(ValueError, match=re.compile(f'match(es)? none')): + sbb_sample_01.find_all_files(pageId='//PHYS000.*') def test_find_all_files_local_only(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', From 264783102f36e79c14a1b53015984217ba8fb081 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 8 Feb 2024 14:38:49 +0100 Subject: [PATCH 20/22] OcrdMets.get_physical_pages: iterate over pages, then patterns in non-cached branch --- src/ocrd_models/ocrd_mets.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 2ff3ecbf1..4e39bf85d 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -616,7 +616,6 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O if self._cache_flag: for pat in page_attr_patterns: try: - # for attr in list(METS_PAGE_DIV_ATTRIBUTE): attr : METS_PAGE_DIV_ATTRIBUTE if isinstance(pat, str): attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat in self._page_cache[a]) @@ -638,18 +637,17 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O raise ValueError(f"{pat} matches none of the keys of any of the _page_caches.") else: page_attr_patterns_copy = list(page_attr_patterns) - page_attr_patterns_idx = set(range(len(page_attr_patterns))) - pat_idx_matched_at_least_once = set() - pat_idx = 0 - while page_attr_patterns: - pat = page_attr_patterns.pop(0) - for page in self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', - namespaces=NS): + page_attr_patterns_matched = [] + for page in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS): + patterns_exhausted = [] + for pat_idx, pat in enumerate(page_attr_patterns): try: if isinstance(pat, str): attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat == page.get(a.name)) ret.append(page if return_divs else page.get('ID')) + patterns_exhausted.append(pat) elif isinstance(pat, list): if not isinstance(pat[0], METS_PAGE_DIV_ATTRIBUTE): pat.insert(0, next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat[0] == page.get(a.name))) @@ -657,21 +655,25 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O if attr_val in pat: pat.remove(attr_val) ret.append(page if return_divs else page.get('ID')) + if len(pat) == 1: + patterns_exhausted.append(pat) elif isinstance(pat, tuple): attr, re_pat = pat if not attr: attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if re_pat.fullmatch(page.get(a.name) or '')) - pat = (attr, re_pat) + page_attr_patterns[pat_idx] = (attr, re_pat) if re_pat.fullmatch(page.get(attr.name) or ''): ret.append(page if return_divs else page.get('ID')) else: raise ValueError + page_attr_patterns_matched.append(pat) except StopIteration: continue - pat_idx_matched_at_least_once.add(pat_idx) - pat_idx += 1 - if unmatched_idx := page_attr_patterns_idx - pat_idx_matched_at_least_once: - raise ValueError(f"Patterns {[page_attr_patterns_copy[x] for x in unmatched_idx]} match none of the pages") + for p in patterns_exhausted: + page_attr_patterns.remove(p) + unmatched = [x for x in page_attr_patterns_copy if x not in page_attr_patterns_matched ] + if unmatched: + raise ValueError(f"Patterns {unmatched} match none of the pages") return ret assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright From 28a1f1839ab67df6d1eb2741cc357d21cc90b974 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 8 Feb 2024 15:50:29 +0100 Subject: [PATCH 21/22] adapt tests to stricter page pattern matching --- tests/model/test_ocrd_mets.py | 63 ++++++++++----------- tests/processor/test_processor.py | 2 +- tests/test_mets_server.py | 56 ++++++++++++------ tests/validator/test_workspace_validator.py | 6 +- 4 files changed, 75 insertions(+), 52 deletions(-) diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index dbd81f492..8dd5d9047 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -73,42 +73,41 @@ def test_file_groups(sbb_sample_01): def test_find_all_files(sbb_sample_01): - assert len(sbb_sample_01.find_all_files()) == 35, '35 files total' - assert len(sbb_sample_01.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' - assert len(sbb_sample_01.find_all_files(include_fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' - assert len(sbb_sample_01.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"' - assert len(sbb_sample_01.find_all_files(fileGrp='//OCR-D-I.*', exclude_fileGrp=['OCR-D-IMG'])) == 10, '10 files in "//OCR-D-I.*" sans OCR-D-IMG' - assert len(sbb_sample_01.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"' - assert len(sbb_sample_01.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"' - assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"' - # assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001-NOTEXIST')) == 0, '0 pages for "PHYS_0001-NOTEXIST"' - assert len(sbb_sample_01.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff' - assert len(sbb_sample_01.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*' - assert len(sbb_sample_01.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE - assert len(sbb_sample_01.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' - assert len(sbb_sample_01.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"' - # assert [str(x.ID) for x in sbb_sample_01.find_all_files()] == [str(x.ID) for x in sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')] - #print([x.ID for x in sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')]) - assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' - assert len(sbb_sample_01.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' - # print([x.ID for x in sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')]) - assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' - assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' - assert len(sbb_sample_01.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' - assert len(sbb_sample_01.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10' - assert len(sbb_sample_01.find_all_files(pageId='1..5')) == 35, '35 files in @ORDER range 1..10' - assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001,PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' - assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001..PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' - assert len(sbb_sample_01.find_all_files(pageId='page 1..page 2,5')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' - assert len(sbb_sample_01.find_all_files(pageId='PHYS_0005,1..2')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' + mets = sbb_sample_01 + assert len(mets.find_all_files()) == 35, '35 files total' + assert len(mets.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' + assert len(mets.find_all_files(include_fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' + assert len(mets.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"' + assert len(mets.find_all_files(fileGrp='//OCR-D-I.*', exclude_fileGrp=['OCR-D-IMG'])) == 10, '10 files in "//OCR-D-I.*" sans OCR-D-IMG' + assert len(mets.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"' + assert len(mets.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"' + assert len(mets.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"' + assert len(mets.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff' + assert len(mets.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*' + assert len(mets.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE + assert len(mets.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' + assert len(mets.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"' + assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' + assert len(mets.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' + assert len(mets.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' + assert len(mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' + assert len(mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' + assert len(mets.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10' + assert len(mets.find_all_files(pageId='1..5')) == 35, '35 files in @ORDER range 1..10' + assert len(mets.find_all_files(pageId='PHYS_0001,PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' + assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' + assert len(mets.find_all_files(pageId='page 1..page 2,5')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' + assert len(mets.find_all_files(pageId='PHYS_0005,1..2')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' with pytest.raises(ValueError, match='differ in their non-numeric part'): - len(sbb_sample_01.find_all_files(pageId='1..PHYS_0002')) + len(mets.find_all_files(pageId='1..PHYS_0002')) with pytest.raises(ValueError, match=re.compile(f'match(es)? none')): - sbb_sample_01.find_all_files(pageId='PHYS_0006..PHYS_0029') + mets.find_all_files(pageId='PHYS_0006..PHYS_0029') with pytest.raises(ValueError, match=re.compile(f'match(es)? none')): - sbb_sample_01.find_all_files(pageId='1..5,PHYS_0006..PHYS_0029') + mets.find_all_files(pageId='PHYS_0001-NOTEXIST') with pytest.raises(ValueError, match=re.compile(f'match(es)? none')): - sbb_sample_01.find_all_files(pageId='//PHYS000.*') + mets.find_all_files(pageId='1..5,PHYS_0006..PHYS_0029') + with pytest.raises(ValueError, match=re.compile(f'match(es)? none')): + mets.find_all_files(pageId='//PHYS000.*') def test_find_all_files_local_only(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001', diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index c6d560ba1..be4cc56ed 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -204,7 +204,7 @@ class ZipTestProcessor(Processor): pass ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id=None) ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001') - for page_id in [None, 'phys_0001,phys_0002']: + for page_id in [None, 'phys_0001']: with self.subTest(page_id=page_id): proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 59ee50e08..478331f97 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -1,4 +1,6 @@ +import re from pytest import fixture, raises +import pytest from tests.base import assets from itertools import repeat @@ -186,22 +188,44 @@ def test_mets_server_socket_stop(start_mets_server): def test_find_all_files(start_mets_server): _, workspace_server = start_mets_server - assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' - assert len(workspace_server.mets.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' - assert len(workspace_server.mets.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"' - assert len(workspace_server.mets.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"' - assert len(workspace_server.mets.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"' - assert len(workspace_server.mets.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"' - assert len(workspace_server.mets.find_all_files(pageId='PHYS_0001-NOTEXIST')) == 0, '0 pages for "PHYS_0001-NOTEXIST"' - assert len(workspace_server.mets.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff' - assert len(workspace_server.mets.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*' - assert len(workspace_server.mets.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE - assert len(workspace_server.mets.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' - assert len(workspace_server.mets.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"' - assert len(workspace_server.mets.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' - assert len(workspace_server.mets.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' - assert len(workspace_server.mets.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' - assert len(workspace_server.mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' + mets = workspace_server.mets + assert len(mets.find_all_files()) == 35, '35 files total' + assert len(mets.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' + # TODO https://github.com/OCR-D/core/issues/1185 + # assert len(mets.find_all_files(include_fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' + assert len(mets.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"' + # TODO https://github.com/OCR-D/core/issues/1185 + # assert len(mets.find_all_files(fileGrp='//OCR-D-I.*', exclude_fileGrp=['OCR-D-IMG'])) == 10, '10 files in "//OCR-D-I.*" sans OCR-D-IMG' + assert len(mets.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"' + assert len(mets.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"' + assert len(mets.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"' + assert len(mets.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff' + assert len(mets.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*' + assert len(mets.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE + assert len(mets.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"' + assert len(mets.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"' + assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"' + assert len(mets.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_001 and PHYS_0002' + assert len(mets.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_001 and PHYS_0005 (two regexes)' + assert len(mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' + assert len(mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002' + assert len(mets.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10' + assert len(mets.find_all_files(pageId='1..5')) == 35, '35 files in @ORDER range 1..10' + assert len(mets.find_all_files(pageId='PHYS_0001,PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' + assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' + assert len(mets.find_all_files(pageId='page 1..page 2,5')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' + assert len(mets.find_all_files(pageId='PHYS_0005,1..2')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005' + # TODO https://github.com/OCR-D/core/issues/1185 + # with pytest.raises(ValueError, match='differ in their non-numeric part'): + # len(mets.find_all_files(pageId='1..PHYS_0002')) + # with pytest.raises(ValueError, match=re.compile(f'match(es)? none')): + # mets.find_all_files(pageId='PHYS_0006..PHYS_0029') + # with pytest.raises(ValueError, match=re.compile(f'match(es)? none')): + # mets.find_all_files(pageId='PHYS_0001-NOTEXIST') + # with pytest.raises(ValueError, match=re.compile(f'match(es)? none')): + # mets.find_all_files(pageId='1..5,PHYS_0006..PHYS_0029') + # with pytest.raises(ValueError, match=re.compile(f'match(es)? none')): + # mets.find_all_files(pageId='//PHYS000.*') def test_reload(start_mets_server): _, workspace_server = start_mets_server diff --git a/tests/validator/test_workspace_validator.py b/tests/validator/test_workspace_validator.py index 39c03bd84..bc516d5a5 100644 --- a/tests/validator/test_workspace_validator.py +++ b/tests/validator/test_workspace_validator.py @@ -41,20 +41,20 @@ def test_check_file_grp_basic(self): def test_check_file_grp_page_id_str(self): workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml')) - report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0003,PHYS_0001') + report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0001') self.assertFalse(report.is_valid) self.assertEqual(len(report.errors), 1) self.assertEqual(report.errors[0], "Output fileGrp[@USE='OCR-D-IMG-BIN'] already contains output for page PHYS_0001") def test_check_file_grp_page_id_list(self): workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml')) - report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id=['PHYS_0003','PHYS_0001']) + report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id=['PHYS_0001']) self.assertFalse(report.is_valid) self.assertEqual(len(report.errors), 1) def test_check_file_grp_page_id_valid(self): workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml')) - report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0004') + report = WorkspaceValidator.check_file_grp(workspace, 'OCR-D-IMG', 'OCR-D-IMG-BIN', page_id='PHYS_0005') self.assertTrue(report.is_valid) def test_simple(self): From c6cfe03aae28c1b051fdc0090b0365d6c731f861 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 9 Feb 2024 19:01:25 +0100 Subject: [PATCH 22/22] OcrdMets.get_physical_pages: raise ValueError if range start not matched --- src/ocrd_models/ocrd_mets.py | 31 +++++++++++++++++++++++++------ tests/model/test_ocrd_mets.py | 2 ++ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 4e39bf85d..0c5edf3c0 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -599,11 +599,12 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O """ if for_fileIds is None and for_pageIds is None: return self.physical_pages - log = getLogger('ocrd.models.ocrd_mets.get_physical_pages') + # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages') if for_pageIds is not None: ret = [] page_attr_patterns = [] - for pageId_token in re.split(r',', for_pageIds): + page_attr_patterns_raw = re.split(r',', for_pageIds) + for pageId_token in page_attr_patterns_raw: if pageId_token.startswith(REGEX_PREFIX): page_attr_patterns.append((None, re.compile(pageId_token[REGEX_PREFIX_LEN:]))) elif '..' in pageId_token: @@ -613,6 +614,8 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O page_attr_patterns.append(pageId_token) if not page_attr_patterns: return [] + range_patterns_first_last = [(x[0], x[-1]) if isinstance(x, list) else None for x in page_attr_patterns] + page_attr_patterns_copy = list(page_attr_patterns) if self._cache_flag: for pat in page_attr_patterns: try: @@ -621,8 +624,10 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat in self._page_cache[a]) cache_keys = [pat] elif isinstance(pat, list): - attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat[0] in self._page_cache[a]) + attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if any(x in self._page_cache[a] for x in pat)) cache_keys = [v for v in pat if v in self._page_cache[attr]] + for k in cache_keys: + pat.remove(k) elif isinstance(pat, tuple): _, re_pat = pat attr = next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) for v in self._page_cache[a] if re_pat.fullmatch(v)) @@ -636,7 +641,6 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O except StopIteration: raise ValueError(f"{pat} matches none of the keys of any of the _page_caches.") else: - page_attr_patterns_copy = list(page_attr_patterns) page_attr_patterns_matched = [] for page in self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', @@ -650,7 +654,7 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O patterns_exhausted.append(pat) elif isinstance(pat, list): if not isinstance(pat[0], METS_PAGE_DIV_ATTRIBUTE): - pat.insert(0, next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if pat[0] == page.get(a.name))) + pat.insert(0, next(a for a in list(METS_PAGE_DIV_ATTRIBUTE) if any(x == page.get(a.name) for x in pat))) attr_val = page.get(pat[0].name) if attr_val in pat: pat.remove(attr_val) @@ -671,9 +675,24 @@ def get_physical_pages(self, for_fileIds : Optional[str] = None, for_pageIds : O continue for p in patterns_exhausted: page_attr_patterns.remove(p) - unmatched = [x for x in page_attr_patterns_copy if x not in page_attr_patterns_matched ] + unmatched = [x for x in page_attr_patterns_copy if x not in page_attr_patterns_matched] if unmatched: raise ValueError(f"Patterns {unmatched} match none of the pages") + + ranges_without_start_match = [] + ranges_without_last_match = [] + for idx, pat in enumerate(page_attr_patterns_copy): + if isinstance(pat, list): + start, last = range_patterns_first_last[idx] + if start in pat: + print(pat, start, last) + ranges_without_start_match.append(page_attr_patterns_raw[idx]) + # if last in pat: + # ranges_without_last_match.append(page_attr_patterns_raw[idx]) + if ranges_without_start_match: + raise ValueError(f"Start of range patterns {ranges_without_start_match} not matched - invalid range") + # if ranges_without_last_match: + # raise ValueError(f"End of range patterns {ranges_without_last_match} not matched - invalid range") return ret assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 8dd5d9047..9634886e1 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -108,6 +108,8 @@ def test_find_all_files(sbb_sample_01): mets.find_all_files(pageId='1..5,PHYS_0006..PHYS_0029') with pytest.raises(ValueError, match=re.compile(f'match(es)? none')): mets.find_all_files(pageId='//PHYS000.*') + with pytest.raises(ValueError, match=re.compile(f'Start of range pattern')): + mets.find_all_files(pageId='PHYS_0000..PHYS_0004') def test_find_all_files_local_only(sbb_sample_01): assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001',