#1, #4, #5: scripts/fn/datainfo_tmx.py, scripts/build-ebooks.sh

EticaAI · Nov 20, 2021 · 6562df5 · 6562df5
1 parent 50135f2
commit 6562df5
Show file tree

Hide file tree

Showing 9 changed files with 235 additions and 155 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,6 +11,12 @@ data/original/tico19-testset
 !README.md
 tmp/
 
+## @see scripts/_setup-local-machine.sh
+#  This is a symbolic link to scripts/ to allow local preview of the site.
+docs/scripts
+#  This is a symbolic link to data/ to allow local preview of the site.
+docs/data
+
 scripts/data-external/*
 !scripts/data-external/.gitkeep
 !scripts/data-external/iso15924__sample.csv
@@ -49,7 +55,6 @@ Gemfile
 Gemfile.lock
 
 
-
 ### asciidoctor, end -----
 
 # https://github.com/datasets/language-codes

diff --git a/docs/eng-Latn/index.adoc b/docs/eng-Latn/index.adoc
@@ -36,7 +36,12 @@ TODO:
 
 Public domain datasets of the https://tico-19.github.io[Translation Initiative for COVID-19] on the format HXLTM (Multilingual Terminology in Humanitarian Language Exchange).
 
+== Tables
 
+[%header,format=csv]
+|===
+include::../scripts/data-info/tico19_tm.csv[]
+|===
 
 == Quick explanations
 

diff --git a/scripts/_run-all-data-scripts.sh b/scripts/_run-all-data-scripts.sh
@@ -50,3 +50,10 @@ set -e
 ./scripts/data-hxltm-terminologia.sh
 
 ./scripts/data-hxltm-translation-memory-import.sh
+
+
+# TODOs
+# - tables render badly on ebooks; convert them to images and add an
+#   if/else (conditional) to the .adoc
+#   - https://stackoverflow.com/questions/26357137/csv-to-image-in-python
+#   - https://stackoverflow.com/questions/902761/saving-a-numpy-array-as-an-image
diff --git a/scripts/_setup-local-machine.sh b/scripts/_setup-local-machine.sh
@@ -26,7 +26,8 @@
 # ==============================================================================
 set -e
 
-# TODO: move this to some file related to deploy website
+PWD_NOW=$(pwd)
+
 if [ ! -f 'Gemfile' ]; then
     VAR_Gemfile=$(cat << EOF
 source 'https://rubygems.org'
@@ -46,11 +47,65 @@ EOF
 )
     echo "$VAR_Gemfile" > Gemfile
     bundle install
+else
+    echo 'OK: Gemfile exists'
+fi
+
+# TODO:
+# .vscode/settings.json
+# # {
+# #     "xml.fileAssociations": [
+# #         {
+# #             "pattern": "**/*.tmx",
+# #             "systemId": "scripts/dtd/tmx14.dtd"
+# #         }
+# #     ]
+# # }
+
+
+# Local preview only: docs/ is served as if it were the GitHub Pages root, so
+# scripts/ and data/ are exposed inside it through symbolic links (both links
+# are ignored by git, see .gitignore). `-L` is true only for an existing
+# symlink, which makes each step safe to re-run.
+if [ ! -L './docs/scripts' ]; then
+    cd './docs/'
+    echo "Create link on /docs/scripts to top folder, since the docs simulate "
+    echo "GitHub pages deployment"
+    # Link target is relative to docs/, hence the cd above.
+    ln -s ../scripts/ ./
+    # Return to the directory saved in PWD_NOW at the top of the script.
+    cd "$PWD_NOW"
+else
+    echo 'OK: ./docs/scripts symlink exists'
+fi
+
+if [ ! -L './docs/data' ]; then
+    cd './docs/'
+    # This message previously named /docs/scripts (copy-paste slip).
+    echo "Create link on /docs/data to top folder, since the docs simulate "
+    echo "GitHub pages deployment"
+    ln -s ../data/ ./
+    cd "$PWD_NOW"
+else
+    echo 'OK: ./docs/data symlink exists'
 fi
 
+printf "\nTesting if some required software are already installed. "
+printf "If something fails, you may not run all software or need some changes\n"
+
+
 set -x
-bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf
-bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.epub
+git --version
+
+rsync --version
+
+## @see https://hxltm.etica.ai
+hxltmcli --version
+
+## @see https://github.com/johnkerl/miller
+mlr --version
 
+## @see http://xmlsoft.org/; example: sudo apt  install libxml2-utils
+xmllint --version
+
+## Example: sudo apt  install xmlstarlet
+xmlstarlet --version
 set +x
-echo 'Okay'
+
+# bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf
+# bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.epub
+
+echo 'OKAY. All working'
diff --git a/scripts/build-ebooks.sh b/scripts/build-ebooks.sh
@@ -0,0 +1,34 @@
+#!/bin/sh
+# ==============================================================================
+#
+#          FILE:  build-ebooks.sh
+#
+#         USAGE:  ./scripts/build-ebooks.sh
+#
+#   DESCRIPTION:  Script NOT related with data generation.
+#                 Create ebooks.
+#
+#       OPTIONS:  ---
+#
+#  REQUIREMENTS:  - POSIX Shell or better
+#                 - scripts/_setup-local-machine.sh
+#                 - asciidoctor-pdf
+#                 - asciidoctor-epub3
+#          BUGS:  ---
+#         NOTES:  ---
+#       AUTHORS:  Emerson Rocha <rocha[at]ieee.org>
+# COLLABORATORS:  <@TODO: put additional non-anonymous names here>
+#       COMPANY:  EticaAI
+#       LICENSE:  Public Domain dedication OR Zero-Clause BSD
+#                 SPDX-License-Identifier: Unlicense OR 0BSD
+#       VERSION:  v1.0
+#       CREATED:  2021-11-20 09:12 UTC
+# ==============================================================================
+
+# Stop at the first failing command.
+set -e
+# Trace every command so the exact asciidoctor invocation shows up in the log.
+set -x
+
+# Render the English AsciiDoc source to PDF and EPUB under docs/.
+# All paths are relative: run this from the repository root (see USAGE above).
+bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf
+bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.epub
+
+# Turn command tracing back off.
+set +x
diff --git a/scripts/data-info/tico19_tm.csv b/scripts/data-info/tico19_tm.csv
@@ -1,38 +1,38 @@
-tmx_filename_original_lang,source_lang_original,source_lang_bcp47,target_lang_original,target_lang_bcp47
-en-ar,en,en,ar,ar
-en-bn,en,en,bn,bn
-en-ckb,en,en,ckb,ckb
-en-din,en,en,din,din
-en-es-LA,en,en,es-LA,es-LA
-en-fa,en,en,fa,fa
-en-fr,en,en,fr,fr
-en-fuv,en,en,fuv,fuv
-en-ha,en,en,ha,ha
-en-hi,en,en,hi,hi
-en-id,en,en,id,id
-en-km,en,en,km,km
-en-kr,en,en,kr,kr
-en-ku,en,en,ku,ku
-en-lg,en,en,lg,lg
-en-ln,en,en,ln,ln
-en-mr,en,en,mr,mr
-en-ms,en,en,ms,ms
-en-my,en,en,my,my
-en-ne,en,en,ne,ne
-en-nus,en,en,nus,nus
-en-om,en,en,om,om
-en-prs,en,en,prs,prs
-en-ps,en,en,ps,ps
-en-pt-BR,en,en,pt-BR,pt-BR
-en-ru,en,en,ru,ru
-en-rw,en,en,rw,rw
-en-so,en,en,so,so
-en-sw,en,en,sw,sw
-en-ta,en,en,ta,ta
-en-ti,en,en,ti,ti
-en-ti_ER,en,en,ti_ER,ti-ER
-en-ti_ET,en,en,ti_ET,ti-ET
-en-tl,en,en,tl,tl
-en-ur,en,en,ur,ur
-en-zh,en,en,zh,zh
-en-zu,en,en,zu,zu
+TICO-19 language pair,Source Language,Source language BCP47,Target language,Target language BCP47,Deterministic language pair
+en-ar,en,en,ar,ar,en_ar
+en-bn,en,en,bn,bn,en_bn
+en-ckb,en,en,ckb,ckb,en_ckb
+en-din,en,en,din,din,en_din
+en-es-LA,en,en,es-LA,es-419,en_es-419
+en-fa,en,en,fa,fa,en_fa
+en-fr,en,en,fr,fr,en_fr
+en-fuv,en,en,fuv,fuv,en_fuv
+en-ha,en,en,ha,ha,en_ha
+en-hi,en,en,hi,hi,en_hi
+en-id,en,en,id,id,en_id
+en-km,en,en,km,km,en_km
+en-kr,en,en,kr,kr,en_kr
+en-ku,en,en,ku,ku,en_ku
+en-lg,en,en,lg,lg,en_lg
+en-ln,en,en,ln,ln,en_ln
+en-mr,en,en,mr,mr,en_mr
+en-ms,en,en,ms,ms,en_ms
+en-my,en,en,my,my,en_my
+en-ne,en,en,ne,ne,en_ne
+en-nus,en,en,nus,nus,en_nus
+en-om,en,en,om,om,en_om
+en-prs,en,en,prs,prs,en_prs
+en-ps,en,en,ps,ps,en_ps
+en-pt-BR,en,en,pt-BR,pt-BR,en_pt-BR
+en-ru,en,en,ru,ru,en_ru
+en-rw,en,en,rw,rw,en_rw
+en-so,en,en,so,so,en_so
+en-sw,en,en,sw,sw,en_sw
+en-ta,en,en,ta,ta,en_ta
+en-ti,en,en,ti,ti,en_ti
+en-ti_ER,en,en,ti_ER,ti-ER,en_ti-ER
+en-ti_ET,en,en,ti_ET,ti-ET,en_ti-ET
+en-tl,en,en,tl,tl,en_tl
+en-ur,en,en,ur,ur,en_ur
+en-zh,en,en,zh,zh,en_zh
+en-zu,en,en,zu,zu,en_zu
diff --git a/scripts/data-original-prepare-translation-memory.sh b/scripts/data-original-prepare-translation-memory.sh
@@ -154,47 +154,12 @@ tico19_tmx_extract "en-zh"
 tico19_tmx_extract "en-zu"
 
 
-./scripts/fn_tico19_datainfo_tmx.py "csv-header" > scripts/data-info/tico19_tm.csv
-
-# find data/original/TM/ -iname all.en-*.zip | grep -E '(en-...?.?.?.?).tmx' --only-matching | sed 's/.tmx//' | grep -v old | sort | xargs printf '\n./scripts/fn_tico19_datainfo_tmx.py "%s" >> scripts/data-info/tico19_tm.csv'
-
-./scripts/fn_tico19_datainfo_tmx.py "en-ar" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-bn" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ckb" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-din" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-es-LA" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-fa" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-fr" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-fuv" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ha" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-hi" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-id" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-km" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-kr" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ku" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-lg" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ln" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-mr" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ms" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-my" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ne" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-nus" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-om" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-prs" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ps" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-pt-BR" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ru" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-rw" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-so" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-sw" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ta" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ti" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ti_ER" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ti_ET" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-tl" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-ur" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-zh" >> scripts/data-info/tico19_tm.csv
-./scripts/fn_tico19_datainfo_tmx.py "en-zu" >> scripts/data-info/tico19_tm.csv
+# ./scripts/fn_tico19_datainfo_tmx.py "csv-header" > scripts/data-info/tico19_tm.csv
+
+## Generate the parameters with this shell-kung-fu
+# find data/original/TM/ -iname all.en-*.zip | grep -E '(en-...?.?.?.?).tmx' --only-matching | sed 's/.tmx//' | grep -v old | sort | xargs printf '%s '
+./scripts/fn/datainfo_tmx.py en-ar en-bn en-ckb en-din en-es-LA en-fa en-fr en-fuv en-ha en-hi en-id en-km en-kr en-ku en-lg en-ln en-mr en-ms en-my en-ne en-nus en-om en-prs en-ps en-pt-BR en-ru en-rw en-so en-sw en-ta en-ti en-ti_ER en-ti_ET en-tl en-ur en-zh en-zu > scripts/data-info/tico19_tm.csv
+
 
 #### scripts/data-info/tico19_tm_twb_initial-language-pairs_source-lang-en.csv ________________
 # Save the languages to CSV file to reuse later

diff --git a/scripts/fn/datainfo_tmx.py b/scripts/fn/datainfo_tmx.py
@@ -0,0 +1,80 @@
+#!/usr/bin/python3
+# ==============================================================================
+#
+#          FILE:  datainfo_tmx.py
+#
+#         USAGE:  ./scripts/fn/datainfo_tmx.py en-pt-BR en-ti_ER en-es-LA
+#
+#   DESCRIPTION: Quick and hackish way to generate a CSV-formatted table
+#                describing what each language pair of the first released
+#                version of the TICO-19 TMXs means.
+#
+#       OPTIONS:  ---
+#
+#  REQUIREMENTS:  - python3
+#          BUGS:  ---
+#         NOTES:  ---
+#       AUTHORS:  Emerson Rocha <rocha[at]ieee.org>
+# COLLABORATORS:  <@TODO: put additional non-anonymous names here>
+#       COMPANY:  EticaAI
+#       LICENSE:  Public Domain dedication OR Zero-Clause BSD
+#                 SPDX-License-Identifier: Unlicense OR 0BSD
+#       VERSION:  v1.0
+#       CREATED:  2021-11-20 03:26 UTC
+# ==============================================================================
+
+import sys
+
+# Show usage and stop when no language pair is given or help is requested.
+if len(sys.argv) < 2 or sys.argv[1] in ('-h', '--help'):
+    # The leading space keeps the script name apart from the placeholders
+    # (it used to print "...datainfo_tmx.pyxx-yy ...").
+    print('usage: ' + sys.argv[0] + ' xx-yy xx-YY_ZZ xx-JJ-LL')
+    print('example: ')
+    print('         ' + sys.argv[0] + ' en-pt-BR en-ti_ER en-es-LA')
+
+    sys.exit()
+
+# CSV header row, printed once before any language-pair row.
+# Column order matches the rows emitted by tico19_language_row().
+line_items = [
+    'TICO-19 language pair',
+    'Source Language',
+    'Source language BCP47',
+    'Target language',
+    'Target language BCP47',
+    'Deterministic language pair',
+]
+print(','.join(line_items))
+
+
+def fubar(lang):
+    """Map a TICO-19 target language code to the code used as its BCP47 form.
+
+    Only 'es-LA' is special-cased (it becomes 'es-419'); every other value
+    is returned unchanged.
+
+    Args:
+        lang: Target language code, already using '-' as separator.
+
+    Returns:
+        'es-419' for 'es-LA', otherwise ``lang`` itself.
+    """
+    return 'es-419' if lang == 'es-LA' else lang
+
+
+def tico19_language_row(tico19_lang_convention):
+    """Print one CSV row describing a TICO-19 language pair.
+
+    The row has the same six columns as the header row this script prints:
+    original pair, source language (original and BCP47), target language
+    (original and BCP47) and the deterministic '<source>_<target>' pair.
+
+    Args:
+        tico19_lang_convention: Pair as named by TICO-19, e.g. 'en-pt-BR',
+            'en-ti_ER' or 'en-es-LA'. Only pairs whose source is 'en' are
+            supported.
+
+    Returns:
+        True once the row has been printed.
+
+    Raises:
+        Exception: If the pair does not start with 'en-'.
+    """
+    source_prefix = 'en-'
+
+    # Anchor on the prefix instead of searching/replacing 'en-' anywhere:
+    # a target such as 'ben-IN' also contains 'en-' and a global replace
+    # would mangle it ('en-ben-IN' -> 'bIN').
+    if not tico19_lang_convention.startswith(source_prefix):
+        raise Exception('Not implemented + [' + tico19_lang_convention + ']')
+
+    lang_part_source_original = 'en'
+    lang_part_source_bc47 = 'en'
+    lang_part_target_original = tico19_lang_convention[len(source_prefix):]
+    # TICO-19 writes some regions with '_' (ti_ER); the BCP47 column uses '-'.
+    lang_part_target_bc47 = fubar(
+        lang_part_target_original.replace('_', '-'))
+
+    lang_pair_deterministic = lang_part_source_bc47 + '_' + \
+        lang_part_target_bc47
+
+    print(','.join([
+        tico19_lang_convention,
+        lang_part_source_original,
+        lang_part_source_bc47,
+        lang_part_target_original,
+        lang_part_target_bc47,
+        lang_pair_deterministic,
+    ]))
+    return True
+
+
+# Emit one CSV row per language pair given on the command line.
+for language_pair in sys.argv[1:]:
+    tico19_language_row(language_pair)
+
+
+# Explicit successful exit once every row has been printed.
+sys.exit()