From 6562df5df3736edeb10492bf6cd95f09a18a2249 Mon Sep 17 00:00:00 2001 From: Emerson Rocha Date: Sat, 20 Nov 2021 06:13:31 -0300 Subject: [PATCH] #1, #4, #5: scripts/fn/datainfo_tmx.py, scripts/build-ebooks.sh --- .gitignore | 7 +- docs/eng-Latn/index.adoc | 5 ++ scripts/_run-all-data-scripts.sh | 7 ++ scripts/_setup-local-machine.sh | 63 ++++++++++++++- scripts/build-ebooks.sh | 34 ++++++++ scripts/data-info/tico19_tm.csv | 76 +++++++++--------- ...ata-original-prepare-translation-memory.sh | 47 ++--------- scripts/fn/datainfo_tmx.py | 80 +++++++++++++++++++ scripts/fn_tico19_datainfo_tmx.py | 71 ---------------- 9 files changed, 235 insertions(+), 155 deletions(-) create mode 100755 scripts/build-ebooks.sh create mode 100755 scripts/fn/datainfo_tmx.py delete mode 100755 scripts/fn_tico19_datainfo_tmx.py diff --git a/.gitignore b/.gitignore index d3149f8..87f4d25 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,12 @@ data/original/tico19-testset !README.md tmp/ +## @see scripts/_setup-local-machine.sh +# This is a symbolic link to scripts/ to allow local preview of the site. +docs/scripts +# This is a symbolic link to data/ to allow local preview of the site. +docs/data + scripts/data-external/* !scripts/data-external/.gitkeep !scripts/data-external/iso15924__sample.csv @@ -49,7 +55,6 @@ Gemfile Gemfile.lock - ### asciidoctor, end ----- # https://github.com/datasets/language-codes diff --git a/docs/eng-Latn/index.adoc b/docs/eng-Latn/index.adoc index 70fdfd5..77d016d 100644 --- a/docs/eng-Latn/index.adoc +++ b/docs/eng-Latn/index.adoc @@ -36,7 +36,12 @@ TODO: Public domain datasets of the https://tico-19.github.io[Translation Initiative for COVID-19] on the format HXLTM (Multilingual Terminology in Humanitarian Language Exchange). +== Tables +[%header,format=csv] +|=== +include::../scripts/data-info/tico19_tm.csv[] +|=== == Quick explanations diff --git a/scripts/_run-all-data-scripts.sh b/scripts/_run-all-data-scripts.sh index 422eab8..4a7eb29 100755 --- a/scripts/_run-all-data-scripts.sh +++ b/scripts/_run-all-data-scripts.sh @@ -50,3 +50,10 @@ set -e ./scripts/data-hxltm-terminologia.sh ./scripts/data-hxltm-translation-memory-import.sh + + +# TODOs +# - tables render bad on ebooks, convert then to image and put ifelse +# on .adoc +# - https://stackoverflow.com/questions/26357137/csv-to-image-in-python +# - https://stackoverflow.com/questions/902761/saving-a-numpy-array-as-an-image \ No newline at end of file diff --git a/scripts/_setup-local-machine.sh b/scripts/_setup-local-machine.sh index 5920182..e4524fa 100755 --- a/scripts/_setup-local-machine.sh +++ b/scripts/_setup-local-machine.sh @@ -26,7 +26,8 @@ # ============================================================================== set -e -# TODO: move this to some file related to deploy website +PWD_NOW=$(pwd) + if [ ! -f 'Gemfile' ]; then VAR_Gemfile=$(cat << EOF source 'https://rubygems.org' @@ -46,11 +47,65 @@ EOF ) echo "$VAR_Gemfile" > Gemfile bundle install +else + echo 'OK: Gemfile exists' +fi + +# TODO: +# .vscode/settings.json +# # { +# # "xml.fileAssociations": [ +# # { +# # "pattern": "**/*.tmx", +# # "systemId": "scripts/dtd/tmx14.dtd" +# # } +# # ] +# # } + + +if [ ! -L './docs/scripts' ]; then + cd './docs/' + echo "Create link on /docs/scripts to top folder, since the docs simulate " + echo "GitHub pages deployment" + ln -s ../scripts/ ./ + cd "$PWD_NOW" +else + echo 'OK: ./docs/scripts symlink exists' +fi + +if [ ! -L './docs/data' ]; then + cd './docs/' + echo "Create link on /docs/scripts to top folder, since the docs simulate " + echo "GitHub pages deployment" + ln -s ../data/ ./ + cd "$PWD_NOW" +else + echo 'OK: ./docs/data symlink exists' fi +printf "\nTesting if some required software are already installed. " +printf "If something fails, you may not run all software or need some changes\n" + + set -x -bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf -bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.epub +git --version + +rsync --version + +## @see https://hxltm.etica.ai +hxltmcli --version + +## @seehttps://github.com/johnkerl/miller +mlr --version +## @see http://xmlsoft.org/; example: sudo apt install libxml2-utils +xmllint --version + +## Example: sudo apt install xmlstarlet +xmlstarlet --version set +x -echo 'Okay' \ No newline at end of file + +# bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf +# bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.epub + +echo 'OKAY. All working' \ No newline at end of file diff --git a/scripts/build-ebooks.sh b/scripts/build-ebooks.sh new file mode 100755 index 0000000..ea50e8b --- /dev/null +++ b/scripts/build-ebooks.sh @@ -0,0 +1,34 @@ +#!/bin/sh +# ============================================================================== +# +# FILE: build-ebooks.sh +# +# USAGE: ./scripts/build-ebooks.sh +# +# DESCRIPTION: Script NOT related with data generation. +# Create ebooks. +# +# OPTIONS: --- +# +# REQUIREMENTS: - POSIX Shell or better +# - scripts/_setup-local-machine.sh +# - asciidoctor-pdf +# - asciidoctor-epub3 +# BUGS: --- +# NOTES: --- +# AUTHORS: Emerson Rocha +# COLLABORATORS: <@TODO: put additional non-anonymous names here> +# COMPANY: EticaAI +# LICENSE: Public Domain dedication OR Zero-Clause BSD +# SPDX-License-Identifier: Unlicense OR 0BSD +# VERSION: v1.0 +# CREATED: 2021-11-20 09:12 UTC +# ============================================================================== + +set -e +set -x + +bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf +bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.epub + +set +x \ No newline at end of file diff --git a/scripts/data-info/tico19_tm.csv b/scripts/data-info/tico19_tm.csv index c839a95..9f6856d 100644 --- a/scripts/data-info/tico19_tm.csv +++ b/scripts/data-info/tico19_tm.csv @@ -1,38 +1,38 @@ -tmx_filename_original_lang,source_lang_original,source_lang_bcp47,target_lang_original,target_lang_bcp47 -en-ar,en,en,ar,ar -en-bn,en,en,bn,bn -en-ckb,en,en,ckb,ckb -en-din,en,en,din,din -en-es-LA,en,en,es-LA,es-LA -en-fa,en,en,fa,fa -en-fr,en,en,fr,fr -en-fuv,en,en,fuv,fuv -en-ha,en,en,ha,ha -en-hi,en,en,hi,hi -en-id,en,en,id,id -en-km,en,en,km,km -en-kr,en,en,kr,kr -en-ku,en,en,ku,ku -en-lg,en,en,lg,lg -en-ln,en,en,ln,ln -en-mr,en,en,mr,mr -en-ms,en,en,ms,ms -en-my,en,en,my,my -en-ne,en,en,ne,ne -en-nus,en,en,nus,nus -en-om,en,en,om,om -en-prs,en,en,prs,prs -en-ps,en,en,ps,ps -en-pt-BR,en,en,pt-BR,pt-BR -en-ru,en,en,ru,ru -en-rw,en,en,rw,rw -en-so,en,en,so,so -en-sw,en,en,sw,sw -en-ta,en,en,ta,ta -en-ti,en,en,ti,ti -en-ti_ER,en,en,ti_ER,ti-ER -en-ti_ET,en,en,ti_ET,ti-ET -en-tl,en,en,tl,tl -en-ur,en,en,ur,ur -en-zh,en,en,zh,zh -en-zu,en,en,zu,zu +TICO-19 language pair,Source Language,Source language BCP47,Target language,Target language BCP47,Deterministic language pair +en-ar,en,en,ar,ar,en_ar +en-bn,en,en,bn,bn,en_bn +en-ckb,en,en,ckb,ckb,en_ckb +en-din,en,en,din,din,en_din +en-es-LA,en,en,es-LA,es-419,en_es-419 +en-fa,en,en,fa,fa,en_fa +en-fr,en,en,fr,fr,en_fr +en-fuv,en,en,fuv,fuv,en_fuv +en-ha,en,en,ha,ha,en_ha +en-hi,en,en,hi,hi,en_hi +en-id,en,en,id,id,en_id +en-km,en,en,km,km,en_km +en-kr,en,en,kr,kr,en_kr +en-ku,en,en,ku,ku,en_ku +en-lg,en,en,lg,lg,en_lg +en-ln,en,en,ln,ln,en_ln +en-mr,en,en,mr,mr,en_mr +en-ms,en,en,ms,ms,en_ms +en-my,en,en,my,my,en_my +en-ne,en,en,ne,ne,en_ne +en-nus,en,en,nus,nus,en_nus +en-om,en,en,om,om,en_om +en-prs,en,en,prs,prs,en_prs +en-ps,en,en,ps,ps,en_ps +en-pt-BR,en,en,pt-BR,pt-BR,en_pt-BR +en-ru,en,en,ru,ru,en_ru +en-rw,en,en,rw,rw,en_rw +en-so,en,en,so,so,en_so +en-sw,en,en,sw,sw,en_sw +en-ta,en,en,ta,ta,en_ta +en-ti,en,en,ti,ti,en_ti +en-ti_ER,en,en,ti_ER,ti-ER,en_ti-ER +en-ti_ET,en,en,ti_ET,ti-ET,en_ti-ET +en-tl,en,en,tl,tl,en_tl +en-ur,en,en,ur,ur,en_ur +en-zh,en,en,zh,zh,en_zh +en-zu,en,en,zu,zu,en_zu diff --git a/scripts/data-original-prepare-translation-memory.sh b/scripts/data-original-prepare-translation-memory.sh index d690724..b16a9a3 100755 --- a/scripts/data-original-prepare-translation-memory.sh +++ b/scripts/data-original-prepare-translation-memory.sh @@ -154,47 +154,12 @@ tico19_tmx_extract "en-zh" tico19_tmx_extract "en-zu" -./scripts/fn_tico19_datainfo_tmx.py "csv-header" > scripts/data-info/tico19_tm.csv - -# find data/original/TM/ -iname all.en-*.zip | grep -E '(en-...?.?.?.?).tmx' --only-matching | sed 's/.tmx//' | grep -v old | sort | xargs printf '\n./scripts/fn_tico19_datainfo_tmx.py "%s" >> scripts/data-info/tico19_tm.csv' - -./scripts/fn_tico19_datainfo_tmx.py "en-ar" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-bn" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ckb" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-din" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-es-LA" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-fa" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-fr" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-fuv" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ha" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-hi" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-id" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-km" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-kr" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ku" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-lg" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ln" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-mr" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ms" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-my" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ne" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-nus" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-om" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-prs" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ps" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-pt-BR" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ru" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-rw" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-so" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-sw" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ta" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ti" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ti_ER" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ti_ET" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-tl" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-ur" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-zh" >> scripts/data-info/tico19_tm.csv -./scripts/fn_tico19_datainfo_tmx.py "en-zu" >> scripts/data-info/tico19_tm.csv +# ./scripts/fn_tico19_datainfo_tmx.py "csv-header" > scripts/data-info/tico19_tm.csv + +## Generate the parameters with this shell-kung-fu +# find data/original/TM/ -iname all.en-*.zip | grep -E '(en-...?.?.?.?).tmx' --only-matching | sed 's/.tmx//' | grep -v old | sort | xargs printf '%s ' +./scripts/fn/datainfo_tmx.py en-ar en-bn en-ckb en-din en-es-LA en-fa en-fr en-fuv en-ha en-hi en-id en-km en-kr en-ku en-lg en-ln en-mr en-ms en-my en-ne en-nus en-om en-prs en-ps en-pt-BR en-ru en-rw en-so en-sw en-ta en-ti en-ti_ER en-ti_ET en-tl en-ur en-zh en-zu > scripts/data-info/tico19_tm.csv + #### scripts/data-info/tico19_tm_twb_initial-language-pairs_source-lang-en.csv ________________ # Save the languages to CSV file to reuse later diff --git a/scripts/fn/datainfo_tmx.py b/scripts/fn/datainfo_tmx.py new file mode 100755 index 0000000..35dc13d --- /dev/null +++ b/scripts/fn/datainfo_tmx.py @@ -0,0 +1,80 @@ +#!/usr/bin/python3 +# ============================================================================== +# +# FILE: datainfo_tmx.py +# +# USAGE: ./scripts/fn/datainfo_tmx.py +# +# DESCRIPTION: Quick and hackish way to generate a CSV formated of what +# first version of released TMXs from TICO-19 means +# by the language pair. +# +# OPTIONS: --- +# +# REQUIREMENTS: - python3 +# BUGS: --- +# NOTES: --- +# AUTHORS: Emerson Rocha +# COLLABORATORS: <@TODO: put additional non-anonymous names here> +# COMPANY: EticaAI +# LICENSE: Public Domain dedication OR Zero-Clause BSD +# SPDX-License-Identifier: Unlicense OR 0BSD +# VERSION: v1.0 +# CREATED: 2021-11-20 03:26 UTC +# ============================================================================== + +import sys + +if len(sys.argv) < 2 or sys.argv[1] == '-h' or sys.argv[1] == '--help': + print('usage: ' + sys.argv[0] + 'xx-yy xx-YY_ZZ xx-JJ-LL') + print('example: ') + print(' ' + sys.argv[0] + ' en-pt-BR en-ti_ER en-es-LA') + + sys.exit() + +line_items = [] +line_items.append('TICO-19 language pair') +line_items.append('Source Language') +line_items.append('Source language BCP47') +line_items.append('Target language') +line_items.append('Target language BCP47') +line_items.append('Deterministic language pair') +print(','.join(line_items)) + + +def fubar(lang): + if lang == 'es-LA': + return 'es-419' + return lang + + +def tico19_language_row(tico19_lang_convention): + if tico19_lang_convention.find('en-') != -1: + line_items = [] + lang_part_original = tico19_lang_convention + lang_part_source_original = 'en' + lang_part_source_bc47 = 'en' + lang_part_target_original = lang_part_original.replace('en-', '') + lang_part_target_bc47 = fubar( + lang_part_target_original.replace('_', '-')) + + lang_pair_deterministic = lang_part_source_bc47 + '_' + \ + lang_part_target_bc47 + + line_items.append(lang_part_original) + line_items.append(lang_part_source_original) + line_items.append(lang_part_source_bc47) + line_items.append(lang_part_target_original) + line_items.append(lang_part_target_bc47) + line_items.append(lang_pair_deterministic) + print(','.join(line_items)) + return True + + raise Exception('Not implemented + [' + tico19_lang_convention + ']') + + +for i in sys.argv[1:]: + tico19_language_row(i) + + +sys.exit() diff --git a/scripts/fn_tico19_datainfo_tmx.py b/scripts/fn_tico19_datainfo_tmx.py deleted file mode 100755 index 4f7e23d..0000000 --- a/scripts/fn_tico19_datainfo_tmx.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/python3 -# ============================================================================== -# -# FILE: scripts/fn_tico19_datainfo_tmx.py -# -# USAGE: ./scripts/fn_tico19_datainfo_tmx.py -# -# DESCRIPTION: Hardcoded function to generate data about what should be -# converted based on the whatever whas the file naming -# on the original. -# -# OPTIONS: --- -# -# REQUIREMENTS: - python3 -# BUGS: --- -# NOTES: --- -# AUTHORS: Emerson Rocha -# COLLABORATORS: <@TODO: put additional non-anonymous names here> -# COMPANY: EticaAI -# LICENSE: Public Domain dedication OR Zero-Clause BSD -# SPDX-License-Identifier: Unlicense OR 0BSD -# VERSION: v1.0 -# CREATED: 2021-11-20 03:26 UTC -# ============================================================================== - -import sys - -if len(sys.argv) < 2 or sys.argv[1] == '-h' or sys.argv[1] == '--help': - print('usage: ' + sys.argv[0] + 'xx-yy') - print('example: ') - print('example: ') - print(' ' + sys.argv[0] + ' csv-header') - print(' ' + sys.argv[0] + ' en-pt-BR') - print(' ' + sys.argv[0] + ' en-ti_ER') - - sys.exit() - -# print(sys.argv[1].find('en-')) - -if sys.argv[1] == 'csv-header': - line_items = [] - line_items.append('tmx_filename_original_lang') - line_items.append('source_lang_original') - line_items.append('source_lang_bcp47') - line_items.append('target_lang_original') - line_items.append('target_lang_bcp47') - print(','.join(line_items)) - sys.exit() - -if sys.argv[1].find('en-') != -1: - - line_items = [] - lang_part_original = sys.argv[1] - lang_part_source_original = 'en' - lang_part_source_bc47 = 'en' - lang_part_target_original = lang_part_original.replace('en-', '') - lang_part_target_bc47 = lang_part_target_original.replace('_', '-') - - line_items.append(lang_part_original) - line_items.append(lang_part_source_original) - line_items.append(lang_part_source_bc47) - line_items.append(lang_part_target_original) - line_items.append(lang_part_target_bc47) - print(','.join(line_items)) - sys.exit() - - -# The way the filenames was so poor that we will not implement en -# all options for a funcion just to allow a quick metadata info. -# We also will generate full dataset later, so no problem -raise Exception('Not implemented')