-
Notifications
You must be signed in to change notification settings - Fork 3
/
ocrd-fileformat-transform
executable file
·112 lines (97 loc) · 4.07 KB
/
ocrd-fileformat-transform
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env bash
# OCR-D processor wrapper: runs `ocr-transform` on every input file of a
# workspace and registers the results in the METS (see main below).

# Strict modes are intentionally left disabled; uncomment for debugging.
#set -e
#set -u
set -o pipefail
# Enable job control: main runs conversions as background jobs and polls `jobs`.
set -m
# set -x

# The `ocrd` command line tool is required for bashlib and all METS operations.
# `command -v` is a shell builtin and, unlike `which`, is always available.
if ! command -v ocrd >/dev/null 2>&1; then
    echo "ocrd not in \$PATH. Panicking" >&2
    exit 1
fi

# Directory containing this script (ocrd-tool.json is looked up next to it).
SHAREDIR="$(cd "$(dirname "$0")" >/dev/null && pwd)"
# Name of this script without any leading directory components.
SCRIPT_NAME="${0##*/}"
# Media type for PAGE-XML output, as reported by `ocrd bashlib constants`.
MIMETYPE_PAGE=$(ocrd bashlib constants MIMETYPE_PAGE)
NJOBS=2 # maximum number of concurrent processes (to better utilize CPU, since this is mostly I/O bound)
#######################################
# Convert every non-image input file with `ocr-transform` and register the
# results in the workspace METS with a single `ocrd workspace bulk-add` call.
#
# Globals (read):
#   SHAREDIR, SCRIPT_NAME   location/name of this processor
#   MIMETYPE_PAGE           media type used for PAGE output
#   NJOBS                   soft limit for concurrent conversions
#   ocrd__argv, params,     provided by the sourced ocrd bashlib
#   ocrd__files             (presumably populated by ocrd__wrap -- confirm)
# Arguments:
#   "$@"  command line of the processor, forwarded to ocrd__wrap
# Outputs:
#   Converted files below the output file group directory; log messages
#   via ocrd__log.
#######################################
main () {
    # Load ocrd bashlib functions
    # shellcheck source=../core/ocrd/bashlib/lib.bash
    source "$(ocrd bashlib filename)"

    # Describe calling script to lib.bash
    ocrd__wrap "$SHAREDIR/ocrd-tool.json" "$SCRIPT_NAME" "$@"
    ocrd__minversion 2.67.0

    # All relative paths below are relative to the workspace directory,
    # so continuing after a failed cd would touch the wrong files.
    cd "${ocrd__argv[working_dir]}" || {
        ocrd__log error "cannot change into working directory \"${ocrd__argv[working_dir]}\""
        exit 1
    }

    local mets_basename out_file_grp overwrite
    mets_basename=$(basename "${ocrd__argv[mets_file]}")
    out_file_grp=${ocrd__argv[output_file_grp]}
    overwrite=${ocrd__argv[overwrite]}

    # read params
    local from_to script_args out_extension out_mimetype
    # Deliberately unquoted: both parameters are whitespace-separated lists
    # that must be split into array elements.
    # shellcheck disable=SC2206
    script_args=(${params['script-args']:-})
    # shellcheck disable=SC2206
    from_to=(${params['from-to']})
    out_extension=${params['ext']}

    # Derive the file extension from the target format unless given explicitly
    if [[ -z "$out_extension" ]]; then
        case "${from_to[1]}" in
            page*|alto*) out_extension=".xml" ;;
            hocr*) out_extension=".html" ;;
            text*) out_extension=".txt" ;;
            *) out_extension=".xml" ;; # unknown target: keep the historic default
        esac
    fi

    # Media type to register for the output files
    case "${from_to[1]}" in
        alto*) out_mimetype="application/alto+xml" ;;
        page*) out_mimetype="$MIMETYPE_PAGE" ;;
        hocr*) out_mimetype="text/html" ;;
        text*) out_mimetype="text/plain" ;;
    esac

    # Each successful job drops one file with its METS entry into this directory
    local results
    results=$(mktemp -d) || {
        ocrd__log error "cannot create temporary directory for results"
        exit 1
    }

    local n
    for ((n=0; n<${#ocrd__files[*]}; n++)); do
        # NOTE(review): the "+ 1" slack is kept from the original throttle;
        # presumably it tolerates an already finished but unreaped job -- confirm.
        if (($(jobs -p | wc -l) > NJOBS + 1)); then
            wait -n # first wait for next subshell (from other iteration) to finish
        fi
        (
            # Runs in a background subshell: `exit` only ends this one job
            # and therefore acts like `continue` for the loop.
            local in_fpath in_id in_mimetype in_pageId out_id out_fpath
            local retval=0
            in_fpath="$(ocrd__input_file "$n" local_filename)"
            in_id="$(ocrd__input_file "$n" ID)"
            in_mimetype="$(ocrd__input_file "$n" mimetype)"
            in_pageId="$(ocrd__input_file "$n" pageId)"
            out_id="$(ocrd__input_file "$n" outputFileId)"
            # Honour the requested/derived extension instead of a fixed ".xml"
            out_fpath="$out_file_grp/${out_id}${out_extension}"

            if [[ "$in_mimetype" =~ ^image/.* ]]; then
                ocrd__log debug "ignoring image $in_id ($in_pageId)"
                exit # continue
            elif ! test -f "${in_fpath#file://}"; then
                ocrd__log error "input file \"$in_fpath\" ID=${in_id} (pageId=${in_pageId}) is not on disk"
                exit # continue
            else
                ocrd__log debug "processing input file $in_id ($in_pageId)"
            fi

            mkdir -p "$out_file_grp"

            # Actual conversion
            ocrd__log info "${from_to[0]} --> ${from_to[1]}: input file $in_id ($in_pageId)"
            # Capture the real exit status; inside `if ! cmd` it would be lost.
            ocr-transform "${from_to[@]}" "$in_fpath" "$out_fpath" -- "${script_args[@]}" 2>&1 || retval=$?
            if ((retval != 0)); then
                ocrd__log error "exited with status $retval: ocr-transform ${from_to[*]} $in_fpath $out_fpath -- ${script_args[*]}"
                exit # continue
            elif [ ! -e "$out_fpath" ]; then
                ocrd__log error "Transformation exited with return value 0 but no file was written."
                exit # continue
            fi

            # Add the output files
            ocrd__log info "Successfully executed: ocr-transform ${from_to[*]} $in_fpath $out_fpath -- ${script_args[*]}"

            # Aggregate all METS actions
            printf '%s %s %s\n' "$in_pageId" "$out_id" "$out_fpath" > "$results/$BASHPID" # no $$ in subshells
        )&
    done
    wait # wait for all remaining jobs to finish

    # Write resulting METS at once
    local -a bulk_options
    bulk_options=(-G "$out_file_grp" -m "$out_mimetype")
    # Each result line is "<pageId> <fileId> <path>"
    bulk_options+=(-r '(?P<page>[^ ]*) (?P<fid>[^ ]*) (?P<fpath>.*)')
    bulk_options+=(-g '{{ page }}' -i '{{ fid }}' -S '{{ fpath }}')
    if [ "$overwrite" = "true" ]; then
        bulk_options+=(--force)
    fi
    local -a workspace_options
    workspace_options=( -m "$mets_basename" )
    if [[ -n "${ocrd__argv[mets_server_url]}" ]]; then
        workspace_options+=( -U "${ocrd__argv[mets_server_url]}" )
    fi

    # Without nullglob an unmatched glob stays literal, so test the first entry
    local -a result_files=("$results"/*)
    if [[ -e "${result_files[0]}" ]]; then
        sort -- "${result_files[@]}" | ocrd workspace "${workspace_options[@]}" bulk-add "${bulk_options[@]}" -
    else
        ocrd__log info "no output files were produced, nothing to add to the METS"
    fi
    rm -fr -- "$results"
}
# Script entry point: run main with all command-line arguments forwarded unchanged
main "$@"