From 3405ffb0f71e4cecb9ec50893d21f2113ffd6ccd Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Tue, 31 Dec 2019 10:35:23 +0100 Subject: [PATCH 1/9] Fix order of arguments for transformer script Signed-off-by: Stefan Weil --- bin/ocr-transform.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/ocr-transform.sh b/bin/ocr-transform.sh index 7cc611b..34f543c 100755 --- a/bin/ocr-transform.sh +++ b/bin/ocr-transform.sh @@ -34,7 +34,6 @@ show_version () { main () { local from="$1" to="$2" infile='-' outfile='-' transformer shift 2 - declare -a script_args # Validate parameters if [[ -z "$from" ]];then @@ -50,6 +49,8 @@ main () { fi fi + declare -a script_args + # if [[ "$1" == '--' ]];then script_args+=("${@:2}") @@ -86,8 +87,7 @@ main () { [[ "$outfile" != '-' ]] && script_args=("${script_args[@]}" "-o:$outfile") exec_saxon "${script_args[@]}" else - script_args=("${script_args[@]}" "$infile") - script_args=("${script_args[@]}" "$outfile") + script_args=("$infile" "$outfile" "${script_args[@]}") "$transformer" "${script_args[@]}" fi } From 31e97e68ed8110a702cf1577cf892fd68174a494 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Tue, 31 Dec 2019 10:36:40 +0100 Subject: [PATCH 2/9] Fix conversion from PAGE to ALTO Signed-off-by: Stefan Weil --- script/transform/page__alto | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) mode change 120000 => 100755 script/transform/page__alto diff --git a/script/transform/page__alto b/script/transform/page__alto deleted file mode 120000 index 15e16ee..0000000 --- a/script/transform/page__alto +++ /dev/null @@ -1 +0,0 @@ -alto__page \ No newline at end of file diff --git a/script/transform/page__alto b/script/transform/page__alto new file mode 100755 index 0000000..d6b7d1a --- /dev/null +++ b/script/transform/page__alto @@ -0,0 +1,27 @@ +#!/bin/bash + +SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +VENDORDIR="$(cd $SCRIPTDIR/../../vendor/; pwd)" +JAR="$VENDORDIR/JPageConverter/PageConverter.jar" +INFILE="$1" +OUTFILE="$2" + +if [[ "$1" = "-" ]]; then + INFILE="$(mktemp)" + cat >"$INFILE" +fi + +if [[ "$2" = "-" ]]; then + OUTFILE="$(mktemp)" +fi + +java -jar "$JAR" -neg-coords toZero -source-xml "$INFILE" -target-xml "$OUTFILE" -convert-to ALTO 2>&1 + +if [[ "$1" = "-" ]]; then + rm "$INFILE" +fi + +if [[ "$2" = "-" ]]; then + cat "$OUTFILE" + rm "$OUTFILE" +fi From 0643c13d12ef84c9a0fff59929051727b446a30a Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Tue, 31 Dec 2019 10:37:31 +0100 Subject: [PATCH 3/9] Fix conversion from ALTO to PAGE Signed-off-by: Stefan Weil --- script/transform/alto__page | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/script/transform/alto__page b/script/transform/alto__page index 152de35..3776d2a 100755 --- a/script/transform/alto__page +++ b/script/transform/alto__page @@ -1,19 +1,27 @@ -#!/bin/bash -x +#!/bin/bash + SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" VENDORDIR="$(cd $SCRIPTDIR/../../vendor/; pwd)" JAR="$VENDORDIR/JPageConverter/PageConverter.jar" INFILE="$1" OUTFILE="$2" -is_temp= -if [[ "$2" = "-" ]];then - is_temp=true +if [[ "$1" = "-" ]]; then + INFILE="$(mktemp)" + cat >"$INFILE" +fi + +if [[ "$2" = "-" ]]; then OUTFILE="$(mktemp)" fi -java -jar "$JAR" -neg-coords toZero -source-xml "$INFILE" -target-xml "$OUTFILE" +java -jar "$JAR" -neg-coords toZero -source-xml "$INFILE" -target-xml "$OUTFILE" 2>&1 + +if [[ "$1" = "-" ]]; then + rm "$INFILE" +fi -if [[ "$is_temp" = true ]];then +if [[ "$2" = "-" ]]; then cat "$OUTFILE" rm "$OUTFILE" fi From 6505a88c00f1c35b5b3bd53765c5116972df680d Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Tue, 31 Dec 2019 16:46:49 +0100 Subject: [PATCH 4/9] Fix conversion from GCV to hOCR Signed-off-by: Stefan Weil --- script/transform/gcv__hocr | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/script/transform/gcv__hocr b/script/transform/gcv__hocr index 95af894..25457b7 100755 --- a/script/transform/gcv__hocr +++ b/script/transform/gcv__hocr @@ -1,4 +1,5 @@ #!/bin/bash + SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" VENDORDIR="$(cd $SCRIPTDIR/../../vendor/; pwd)" VENDORSCRIPT="$VENDORDIR/gcv2hocr/gcv2hocr" @@ -8,15 +9,22 @@ OUTFILE="$2" WIDTH=2000 HEIGHT=2000 -is_temp= -if [[ "$2" = "-" ]];then - is_temp=true +if [[ "$1" = "-" ]]; then + INFILE="$(mktemp)" + cat >"$INFILE" +fi + +if [[ "$2" = "-" ]]; then OUTFILE="$(mktemp)" fi "$VENDORSCRIPT" "$INFILE" "$OUTFILE" "$WIDTH" "$HEIGHT" -if [[ "$is_temp" = true ]];then +if [[ "$1" = "-" ]]; then + rm "$INFILE" +fi + +if [[ "$2" = "-" ]]; then cat "$OUTFILE" rm "$OUTFILE" fi From 06d037e281978806d1f516b05b2737421437fdb8 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Tue, 31 Dec 2019 18:36:27 +0100 Subject: [PATCH 5/9] Add transformation from hOCR to PAGE Signed-off-by: Stefan Weil --- README.md | 3 ++- script/transform/hocr__page | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) create mode 120000 script/transform/hocr__page diff --git a/README.md b/README.md index 23396cd..bfb2839 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ Usage: ocr-transform [-dhLv] [ []] [-- Date: Wed, 1 Jan 2020 09:05:22 +0100 Subject: [PATCH 6/9] Add transformation from ABBYY FineReader XML to PAGE Signed-off-by: Stefan Weil --- README.md | 3 ++- script/transform/abbyy__page | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) create mode 120000 script/transform/abbyy__page diff --git a/README.md b/README.md index bfb2839..ebd0b90 100644 --- a/README.md +++ b/README.md @@ -144,6 +144,7 @@ Usage: ocr-transform [-dhLv] [ []] [-- Date: Wed, 1 Jan 2020 09:09:27 +0100 Subject: [PATCH 7/9] Add transformation from Google Cloud Vision JSON to PAGE Signed-off-by: Stefan Weil --- README.md | 3 ++- script/transform/gcv__page | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) create mode 120000 script/transform/gcv__page diff --git a/README.md b/README.md index ebd0b90..d178c41 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,7 @@ Usage: ocr-transform [-dhLv] [ []] [-- Date: Wed, 1 Jan 2020 10:38:22 +0100 Subject: [PATCH 8/9] Format output from PageConverter The output is formatted when a Saxon serialization parameter is given on the command line. The web interface automatically uses `!indent=yes`. Signed-off-by: Stefan Weil --- script/transform/alto__page | 7 ++++++- script/transform/page__alto | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/script/transform/alto__page b/script/transform/alto__page index 3776d2a..0d9ceff 100755 --- a/script/transform/alto__page +++ b/script/transform/alto__page @@ -5,6 +5,7 @@ VENDORDIR="$(cd $SCRIPTDIR/../../vendor/; pwd)" JAR="$VENDORDIR/JPageConverter/PageConverter.jar" INFILE="$1" OUTFILE="$2" +ARGUMENT="$3" if [[ "$1" = "-" ]]; then INFILE="$(mktemp)" @@ -22,6 +23,10 @@ if [[ "$1" = "-" ]]; then fi if [[ "$2" = "-" ]]; then - cat "$OUTFILE" + if [[ -z "$ARGUMENT" ]]; then + cat "$OUTFILE" + else + java -cp "$VENDORDIR/saxon9he.jar" net.sf.saxon.Query -s:"$OUTFILE" -qs:/ "$ARGUMENT" + fi rm "$OUTFILE" fi diff --git a/script/transform/page__alto b/script/transform/page__alto index d6b7d1a..12f063a 100755 --- a/script/transform/page__alto +++ b/script/transform/page__alto @@ -5,6 +5,7 @@ VENDORDIR="$(cd $SCRIPTDIR/../../vendor/; pwd)" JAR="$VENDORDIR/JPageConverter/PageConverter.jar" INFILE="$1" OUTFILE="$2" +ARGUMENT="$3" if [[ "$1" = "-" ]]; then INFILE="$(mktemp)" @@ -22,6 +23,10 @@ if [[ "$1" = "-" ]]; then fi if [[ "$2" = "-" ]]; then - cat "$OUTFILE" + if [[ -z "$ARGUMENT" ]]; then + cat "$OUTFILE" + else + java -cp "$VENDORDIR/saxon9he.jar" net.sf.saxon.Query -s:"$OUTFILE" -qs:/ "$ARGUMENT" + fi rm "$OUTFILE" fi From d1c4477e5826c1c82dbd5548489ac869b28d48a9 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Wed, 1 Jan 2020 23:14:13 +0100 Subject: [PATCH 9/9] Fix debug message It used an undefined macro SAXON_JAR. Signed-off-by: Stefan Weil --- lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib.sh b/lib.sh index 7f276b6..63cd375 100644 --- a/lib.sh +++ b/lib.sh @@ -118,7 +118,7 @@ show_saxon_options () { #{{{ run saxon / xsd-validator (xsdv.sh) # exec_saxon () exec_saxon() { - (( DEBUG > 0 )) && loginfo Executing "java -jar $SAXON_JAR" "$@" + (( DEBUG > 0 )) && loginfo Executing "java -jar $SHAREDIR/vendor/saxon9he.jar" "$@" (( DEBUG > 1 )) && SAXON_ARGS+=('-t') java -jar "$SHAREDIR/vendor/saxon9he.jar" "$@" }