Skip to content

Commit

Permalink
Merge pull request #82 from bertsky/use-bashlib-input-files
Browse files — browse the repository at this point in the history
simplify file iterator by delegating to bashlib input-files
  • Loading branch information
bertsky authored Dec 8, 2021
2 parents 9487dfd + 85a9819 commit 9117389
Showing 1 changed file with 9 additions and 54 deletions.
63 changes: 9 additions & 54 deletions ocrd-olena-binarize
Original file line number | Diff line number | Diff line change
Expand Up @@ -356,7 +356,7 @@ function main {
# shellcheck source=../core/ocrd/bashlib/lib.bash
source $(ocrd bashlib filename)
ocrd__wrap "$SHAREDIR/ocrd-tool.json" "ocrd-olena-binarize" "$@"
ocrd__minversion 2.13.0
ocrd__minversion 2.29.0

scribo_options=(--enable-negate-output)
case ${params[impl]} in
Expand Down Expand Up @@ -386,66 +386,21 @@ function main {
esac

cd "${ocrd__argv[working_dir]}"
page_id=${ocrd__argv[page_id]:-}
in_file_grp=${ocrd__argv[input_file_grp]}
out_file_grp=${ocrd__argv[output_file_grp]}
mkdir -p $out_file_grp

local IFS=$'\n'
files=($(ocrd workspace find \
${page_id:+-g} ${page_id:-} \
-G $in_file_grp \
-k url \
-k ID \
-k mimetype \
-k pageId \
--download))
page_pages=($(ocrd workspace find \
-G $in_file_grp \
-m $MIMETYPE_PAGE \
-k pageId))
multi_pages=($(ocrd workspace find \
-G $in_file_grp \
-m //image/.* \
-k pageId | sort | uniq -d))
declare -A is_page is_multi
for page in "${page_pages[@]}"; do
is_page[$page]=1
done
for page in "${multi_pages[@]}"; do
is_multi[$page]=1
done
local IFS=$' \t\n'
local n=0 zeros=0000
for csv in "${files[@]}"; do
let n+=1
# Parse tab-separated fields
local IFS=$'\t'
local fields=($csv)
local IFS=$' \t\n'

local in_fpath="${fields[0]}"
local in_id="${fields[1]}"
local in_mimetype="${fields[2]}"
local in_pageId="${fields[3]:-}"
for ((n=0; n<${#ocrd__files[*]}; n++)); do
local in_fpath="$(ocrd__input_file $n url)"
local in_id="$(ocrd__input_file $n ID)"
local in_mimetype="$(ocrd__input_file $n mimetype)"
local in_pageId="$(ocrd__input_file $n pageId)"
local out_id="$(ocrd__input_file $n outputFileId)"
local out_fpath="$out_file_grp/${out_id}.xml"

if ! test -f "${in_fpath#file://}"; then
ocrd log error "input file ID=${in_id} (pageId=${in_pageId} MIME=${in_mimetype}) is not on disk"
continue
fi
# fileGrps may contain PAGE and (derived) images
# so if this pageId has a PAGE file, ignore all others
# and otherwise if it has multiple images, raise an error
if ((${is_page[${in_pageId:--}]:-0})); then
test x$in_mimetype != x$MIMETYPE_PAGE && continue
elif ((${is_multi[${in_pageId:--}]:-0})); then
ocrd__raise "No PAGE-XML for page '$in_pageId' in fileGrp '$in_file_grp' but multiple images."
fi
local out_id="${in_id//$in_file_grp/$out_file_grp}"
if [ "x$out_id" = "x$in_id" ]; then
out_id=${out_file_grp}_${zeros:0:$((4-${#n}))}$n
fi
local out_fpath="$out_file_grp/${out_id}.xml"
mkdir -p $out_file_grp

if [ "x${in_mimetype}" = x${MIMETYPE_PAGE} ]; then
ocrd log info "processing PAGE-XML input file $in_id ($in_pageId)"
Expand Down

0 comments on commit 9117389

Please sign in to comment.