Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ocrd_mets: add get_physical_pages(for_pageIds=...) #1063

Merged
merged 25 commits into from
Feb 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1e3e702
ocrd_mets: add get_physical_pages(for_pageIds=...)
bertsky Jun 26, 2023
07a9fe0
ocrd workspace list-page: --page-id option
bertsky Jun 26, 2023
25854c5
ocrd_mets: expose property physical_pages_labels
bertsky Jun 28, 2023
ccb51ce
ocrd workspace list-page: add --output-field, delegating to page labels
bertsky Jun 28, 2023
e181758
get phys pages returns strs or divs
MehmedGIT Jul 4, 2023
26b64c9
merge master and adapt to page-range output changes
kba Jan 15, 2024
073d9b0
update list-page-workspace with @ORDER
kba Jan 15, 2024
e91cf50
add typing info for caches in OcrdMets
kba Jan 15, 2024
c642d04
more complete test workspace for page labelling/partitioning
kba Jan 15, 2024
9dea95f
replace update-page with a cleaner solution based on get_physical_pages
kba Jan 15, 2024
cfd1c91
OcrdMets: extend the _page_cache to include all METS_PAGE_DIV_ATTRIBUTEs
kba Jan 16, 2024
ee8fb69
implement generic page attribute ranges
kba Jan 16, 2024
1427c07
utils.generate_range: raise a ValueError if non-numeric parts differ
kba Jan 17, 2024
c36360d
fix tests
kba Jan 17, 2024
3a60c1f
revert accidental commit to ocrd_utils/pyproject.toml
kba Jan 17, 2024
643d1ef
Merge branch 'master' into ocrd-mets-get-pages-for-pageids
kba Jan 30, 2024
517814b
get_physical_pages: return early if no patterns
kba Jan 30, 2024
1225912
OcrdMets.find_all_files: fix page attr loop
kba Feb 6, 2024
4a25d1e
OcrdMets.get_physical_pages should return IDs if not return_divs
kba Feb 8, 2024
466c61d
OcrdMets.get_physical_pages: Cache the attribute in the non-cached re…
kba Feb 8, 2024
9f84067
OcrdMets.get_physical_pages: raise ValueError if a pattern matches no…
kba Feb 8, 2024
2647831
OcrdMets.get_physical_pages: iterate over pages, then patterns in non…
kba Feb 8, 2024
28a1f18
adapt tests to stricter page pattern matching
kba Feb 8, 2024
c6cfe03
OcrdMets.get_physical_pages: raise ValueError if range start not matched
kba Feb 9, 2024
8e06532
Merge branch 'master' into ocrd-mets-get-pages-for-pageids
kba Feb 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 74 additions & 31 deletions src/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list, DEFAULT_METS_BASENAME
from ocrd.decorators import mets_find_options
from . import command_with_replaced_help
from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE


class WorkspaceCtx():
Expand Down Expand Up @@ -419,21 +420,22 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
default=['local_filename'],
multiple=True,
type=click.Choice([
'url',
'mimetype',
'page_id',
'pageId',
'file_id',
'ID',
'file_grp',
'fileGrp',
'basename',
'basename_without_extension',
'local_filename',
]))
default=['local_filename'],
show_default=True,
multiple=True,
type=click.Choice([
'url',
'mimetype',
'page_id',
'pageId',
'file_id',
'ID',
'file_grp',
'fileGrp',
'basename',
'basename_without_extension',
'local_filename',
]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ")
@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
Expand Down Expand Up @@ -596,31 +598,60 @@ def list_groups(ctx):
# ----------------------------------------------------------------------

@workspace_cli.command('list-page')
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['ID'],
              show_default=True,
              multiple=True,
              type=click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()))
@click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']), default='one-per-line')
@click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks", default=1, type=int)
@click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int)
@click.option('-r', '--page-id-range', help="Restrict the pages to those matching the provided range, based on the @ID attribute. Separate start/end with ..")
@click.option('-R', '--numeric-range', help="Restrict the pages to those in the range, in numerical document order. Separate start/end with ..")
@pass_workspace
def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
    """
    List physical page IDs

    (If any ``FILTER`` starts with ``//``, then its remainder
    will be interpreted as a regular expression.)
    """
    # NOTE: the docstring above doubles as the CLI help text, so it is kept verbatim.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)

    # click hands ``multiple=True`` options over as a tuple; normalise to a
    # list so that the ``== ['ID']`` fast-path check below can actually match.
    output_field = list(output_field)

    find_kwargs = {}
    if page_id_range and 'ID' in output_field:
        find_kwargs['pageId'] = page_id_range
    # De-duplicated, sorted page IDs of all files matching the filter.
    page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})

    if output_field == ['ID']:
        # Only the ID was requested: no need to look at the page divs at all.
        ret = [[x] for x in page_ids]
    else:
        # One row per page div, one column per requested attribute; a missing
        # attribute is rendered as the literal string 'None'.
        page_divs = workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)
        ret = [[page_div.get(k, 'None') for k in output_field] for page_div in page_divs]

    if numeric_range:
        # 1-based, inclusive START..END range over the rows collected so far.
        start, end = map(int, numeric_range.split('..'))
        ret = ret[start-1:end]

    chunks = partition_list(ret, chunk_number, chunk_index)
    lines = []
    if output_format == 'json':
        lines.append(dumps(chunks))
    elif output_format in ('one-per-line', 'comma-separated'):
        # Fields of a row are tab-joined; rows of a chunk are joined with a
        # newline or a comma; each chunk becomes one element of ``lines``.
        row_separator = '\n' if output_format == 'one-per-line' else ','
        lines = [row_separator.join("\t".join(entry) for entry in chunk) for chunk in chunks]
    print('\n'.join(lines))

# ----------------------------------------------------------------------
# ocrd workspace get-id
Expand Down Expand Up @@ -657,18 +688,30 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin
workspace.save_mets()

@workspace_cli.command('update-page')
@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True)
@click.option('--order', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDER')
@click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL')
@click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='CONTENTIDS')
@click.argument('PAGE_ID')
@pass_workspace
def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
    """
    Update the @ID, @ORDER, @ORDERLABEL, @LABEL or @CONTENTIDS attributes of the mets:div with @ID=PAGE_ID
    """
    # ``--set ATTR VALUE`` may be repeated; each occurrence arrives as an
    # (ATTR, VALUE) pair, a later pair for the same ATTR overriding an earlier one.
    update_kwargs = dict(attr_value_pairs)
    # The deprecated single-purpose options are applied after --set, so they
    # take precedence over a --set pair for the same attribute.
    if order:
        update_kwargs['ORDER'] = order
    if orderlabel:
        update_kwargs['ORDERLABEL'] = orderlabel
    if contentids:
        update_kwargs['CONTENTIDS'] = contentids
    try:
        workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
        workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
        workspace.save_mets()
    except Exception as err:
        # CLI boundary: report any failure as a short message and exit with
        # status 1 instead of showing a traceback.
        print(f"Error: {err}")
        sys.exit(1)

# ----------------------------------------------------------------------
# ocrd workspace merge
Expand Down
13 changes: 13 additions & 0 deletions src/ocrd_models/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Constants for ocrd_models.
"""
from enum import Enum, auto
from ocrd_utils import resource_string

__all__ = [
Expand All @@ -26,6 +27,7 @@
'TAG_PAGE_TEXTLINE',
'TAG_PAGE_TEXTEQUIV',
'TAG_PAGE_TEXTREGION',
'METS_PAGE_DIV_ATTRIBUTE',
]


Expand Down Expand Up @@ -70,3 +72,14 @@
'LineDrawing', 'Map', 'Maths', 'Music', 'Noise',
'Separator', 'Table', 'Text', 'Unknown'
]

class METS_PAGE_DIV_ATTRIBUTE(Enum):
    """
    Names of the page-level ``mets:div`` attributes handled by this package.

    Member values are assigned by ``auto()``; use the member *names*
    (see :py:meth:`names`) when an attribute name string is needed.
    """
    ID = auto()
    ORDER = auto()
    ORDERLABEL = auto()
    LABEL = auto()
    CONTENTIDS = auto()

    @classmethod
    def names(cls):
        """
        Return the member names as a list of strings, in definition order.
        """
        return [member.name for member in cls]
Loading
Loading