Skip to content

Commit

Permalink
#1, #4, #5: scripts/fn/datainfo_tmx.py, scripts/build-ebooks.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
fititnt committed Nov 20, 2021
1 parent 50135f2 commit 6562df5
Show file tree
Hide file tree
Showing 9 changed files with 235 additions and 155 deletions.
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ data/original/tico19-testset
!README.md
tmp/

## @see scripts/_setup-local-machine.sh
# This is a symbolic link to scripts/ to allow local preview of the site.
docs/scripts
# This is a symbolic link to data/ to allow local preview of the site.
docs/data

scripts/data-external/*
!scripts/data-external/.gitkeep
!scripts/data-external/iso15924__sample.csv
Expand Down Expand Up @@ -49,7 +55,6 @@ Gemfile
Gemfile.lock



### asciidoctor, end -----

# https://github.com/datasets/language-codes
Expand Down
5 changes: 5 additions & 0 deletions docs/eng-Latn/index.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,12 @@ TODO:
Public domain datasets of the https://tico-19.github.io[Translation Initiative for COVID-19] in the HXLTM format (Multilingual Terminology in Humanitarian Language Exchange).
== Tables
[%header,format=csv]
|===
include::../scripts/data-info/tico19_tm.csv[]
|===
== Quick explanations
Expand Down
7 changes: 7 additions & 0 deletions scripts/_run-all-data-scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,10 @@ set -e
./scripts/data-hxltm-terminologia.sh

./scripts/data-hxltm-translation-memory-import.sh


# TODOs
# - Tables render badly in ebooks; convert them to images and select the
# image or the table with conditional directives in the .adoc
# - https://stackoverflow.com/questions/26357137/csv-to-image-in-python
# - https://stackoverflow.com/questions/902761/saving-a-numpy-array-as-an-image
63 changes: 59 additions & 4 deletions scripts/_setup-local-machine.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
# ==============================================================================
set -e

# TODO: move this to some file related to deploy website
PWD_NOW=$(pwd)

if [ ! -f 'Gemfile' ]; then
VAR_Gemfile=$(cat << EOF
source 'https://rubygems.org'
Expand All @@ -46,11 +47,65 @@ EOF
)
echo "$VAR_Gemfile" > Gemfile
bundle install
else
echo 'OK: Gemfile exists'
fi

# TODO:
# .vscode/settings.json
# # {
# # "xml.fileAssociations": [
# # {
# # "pattern": "**/*.tmx",
# # "systemId": "scripts/dtd/tmx14.dtd"
# # }
# # ]
# # }


# Local preview of the site: docs/ simulates the GitHub Pages deployment, so
# the top-level scripts/ folder is exposed inside it through a relative
# symlink. Nothing is done when the link is already in place.
if [ -L './docs/scripts' ]; then
  echo 'OK: ./docs/scripts symlink exists'
else
  cd './docs/'
  printf '%s\n' \
    "Create link on /docs/scripts to top folder, since the docs simulate " \
    "GitHub pages deployment"
  ln -s ../scripts/ ./
  # Return to the directory the script was started from.
  cd "$PWD_NOW"
fi

# Local preview of the site: expose the top-level data/ folder inside docs/
# through a relative symlink. Nothing is done when the link already exists.
if [ ! -L './docs/data' ]; then
  cd './docs/'
  # The message used to name /docs/scripts (copied from the scripts/ block);
  # this block creates the data/ link, so report the right path.
  echo "Create link on /docs/data to top folder, since the docs simulate "
  echo "GitHub pages deployment"
  ln -s ../data/ ./
  # Return to the directory the script was started from.
  cd "$PWD_NOW"
else
  echo 'OK: ./docs/data symlink exists'
fi

# Smoke test of the external tools used by the project. `set -e` is active in
# this script, so the first command that is missing or fails aborts the run.
printf "\nTesting if some required software are already installed. "
printf "If something fails, you may not run all software or need some changes\n"


# Trace every command below so the output shows which tool is being checked.
set -x
# Build the PDF and the EPUB from docs/eng-Latn/index.adoc.
bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf
bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.epub
git --version

rsync --version

## @see https://hxltm.etica.ai
hxltmcli --version

## @see https://github.com/johnkerl/miller
mlr --version

## @see http://xmlsoft.org/; example: sudo apt install libxml2-utils
xmllint --version

## Example: sudo apt install xmlstarlet
xmlstarlet --version
# Stop tracing before the final status messages.
set +x
echo 'Okay'

# Ebook builds kept here only as a reference (disabled).
# bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf
# bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.epub

echo 'OKAY. All working'
34 changes: 34 additions & 0 deletions scripts/build-ebooks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/sh
# ==============================================================================
#
# FILE: build-ebooks.sh
#
# USAGE: ./scripts/build-ebooks.sh
#
# DESCRIPTION: Script NOT related to data generation.
# Create ebooks.
#
# OPTIONS: ---
#
# REQUIREMENTS: - POSIX Shell or better
# - scripts/_setup-local-machine.sh
# - asciidoctor-pdf
# - asciidoctor-epub3
# BUGS: ---
# NOTES: ---
# AUTHORS: Emerson Rocha <rocha[at]ieee.org>
# COLLABORATORS: <@TODO: put additional non-anonymous names here>
# COMPANY: EticaAI
# LICENSE: Public Domain dedication OR Zero-Clause BSD
# SPDX-License-Identifier: Unlicense OR 0BSD
# VERSION: v1.0
# CREATED: 2021-11-20 09:12 UTC
# ==============================================================================

# Abort on the first failing command.
set -e

# Every path below (and the Gemfile picked up by `bundle exec`) is relative to
# the repository root, i.e. the parent of the directory holding this script.
# Move there first so the script also works when started from another
# directory. CDPATH is emptied for this call so `cd` cannot resolve the
# relative path against an unrelated directory.
CDPATH= cd -- "$(dirname -- "$0")/.."

# Trace the build commands.
set -x

# PDF edition.
bundle exec asciidoctor-pdf -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.pdf
# EPUB edition.
bundle exec asciidoctor-epub3 -v --attribute allow-uri-read=1 --attribute source-highlighter=rouge docs/eng-Latn/index.adoc --out-file docs/tico-19-hxltm_eng-Latn.epub

set +x
76 changes: 38 additions & 38 deletions scripts/data-info/tico19_tm.csv
Original file line number Diff line number Diff line change
@@ -1,38 +1,38 @@
tmx_filename_original_lang,source_lang_original,source_lang_bcp47,target_lang_original,target_lang_bcp47
en-ar,en,en,ar,ar
en-bn,en,en,bn,bn
en-ckb,en,en,ckb,ckb
en-din,en,en,din,din
en-es-LA,en,en,es-LA,es-LA
en-fa,en,en,fa,fa
en-fr,en,en,fr,fr
en-fuv,en,en,fuv,fuv
en-ha,en,en,ha,ha
en-hi,en,en,hi,hi
en-id,en,en,id,id
en-km,en,en,km,km
en-kr,en,en,kr,kr
en-ku,en,en,ku,ku
en-lg,en,en,lg,lg
en-ln,en,en,ln,ln
en-mr,en,en,mr,mr
en-ms,en,en,ms,ms
en-my,en,en,my,my
en-ne,en,en,ne,ne
en-nus,en,en,nus,nus
en-om,en,en,om,om
en-prs,en,en,prs,prs
en-ps,en,en,ps,ps
en-pt-BR,en,en,pt-BR,pt-BR
en-ru,en,en,ru,ru
en-rw,en,en,rw,rw
en-so,en,en,so,so
en-sw,en,en,sw,sw
en-ta,en,en,ta,ta
en-ti,en,en,ti,ti
en-ti_ER,en,en,ti_ER,ti-ER
en-ti_ET,en,en,ti_ET,ti-ET
en-tl,en,en,tl,tl
en-ur,en,en,ur,ur
en-zh,en,en,zh,zh
en-zu,en,en,zu,zu
TICO-19 language pair,Source Language,Source language BCP47,Target language,Target language BCP47,Deterministic language pair
en-ar,en,en,ar,ar,en_ar
en-bn,en,en,bn,bn,en_bn
en-ckb,en,en,ckb,ckb,en_ckb
en-din,en,en,din,din,en_din
en-es-LA,en,en,es-LA,es-419,en_es-419
en-fa,en,en,fa,fa,en_fa
en-fr,en,en,fr,fr,en_fr
en-fuv,en,en,fuv,fuv,en_fuv
en-ha,en,en,ha,ha,en_ha
en-hi,en,en,hi,hi,en_hi
en-id,en,en,id,id,en_id
en-km,en,en,km,km,en_km
en-kr,en,en,kr,kr,en_kr
en-ku,en,en,ku,ku,en_ku
en-lg,en,en,lg,lg,en_lg
en-ln,en,en,ln,ln,en_ln
en-mr,en,en,mr,mr,en_mr
en-ms,en,en,ms,ms,en_ms
en-my,en,en,my,my,en_my
en-ne,en,en,ne,ne,en_ne
en-nus,en,en,nus,nus,en_nus
en-om,en,en,om,om,en_om
en-prs,en,en,prs,prs,en_prs
en-ps,en,en,ps,ps,en_ps
en-pt-BR,en,en,pt-BR,pt-BR,en_pt-BR
en-ru,en,en,ru,ru,en_ru
en-rw,en,en,rw,rw,en_rw
en-so,en,en,so,so,en_so
en-sw,en,en,sw,sw,en_sw
en-ta,en,en,ta,ta,en_ta
en-ti,en,en,ti,ti,en_ti
en-ti_ER,en,en,ti_ER,ti-ER,en_ti-ER
en-ti_ET,en,en,ti_ET,ti-ET,en_ti-ET
en-tl,en,en,tl,tl,en_tl
en-ur,en,en,ur,ur,en_ur
en-zh,en,en,zh,zh,en_zh
en-zu,en,en,zu,zu,en_zu
47 changes: 6 additions & 41 deletions scripts/data-original-prepare-translation-memory.sh
Original file line number Diff line number Diff line change
Expand Up @@ -154,47 +154,12 @@ tico19_tmx_extract "en-zh"
tico19_tmx_extract "en-zu"


./scripts/fn_tico19_datainfo_tmx.py "csv-header" > scripts/data-info/tico19_tm.csv

# find data/original/TM/ -iname all.en-*.zip | grep -E '(en-...?.?.?.?).tmx' --only-matching | sed 's/.tmx//' | grep -v old | sort | xargs printf '\n./scripts/fn_tico19_datainfo_tmx.py "%s" >> scripts/data-info/tico19_tm.csv'

./scripts/fn_tico19_datainfo_tmx.py "en-ar" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-bn" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ckb" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-din" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-es-LA" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-fa" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-fr" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-fuv" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ha" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-hi" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-id" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-km" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-kr" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ku" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-lg" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ln" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-mr" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ms" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-my" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ne" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-nus" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-om" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-prs" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ps" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-pt-BR" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ru" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-rw" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-so" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-sw" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ta" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ti" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ti_ER" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ti_ET" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-tl" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-ur" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-zh" >> scripts/data-info/tico19_tm.csv
./scripts/fn_tico19_datainfo_tmx.py "en-zu" >> scripts/data-info/tico19_tm.csv
# ./scripts/fn_tico19_datainfo_tmx.py "csv-header" > scripts/data-info/tico19_tm.csv

## Write scripts/data-info/tico19_tm.csv in one call: every language pair is
## passed as an argument and stdout replaces the previous file content.
## The argument list can be regenerated with this pipeline:
# find data/original/TM/ -iname all.en-*.zip | grep -E '(en-...?.?.?.?).tmx' --only-matching | sed 's/.tmx//' | grep -v old | sort | xargs printf '%s '
./scripts/fn/datainfo_tmx.py en-ar en-bn en-ckb en-din en-es-LA en-fa en-fr en-fuv en-ha en-hi en-id en-km en-kr en-ku en-lg en-ln en-mr en-ms en-my en-ne en-nus en-om en-prs en-ps en-pt-BR en-ru en-rw en-so en-sw en-ta en-ti en-ti_ER en-ti_ET en-tl en-ur en-zh en-zu > scripts/data-info/tico19_tm.csv


#### scripts/data-info/tico19_tm_twb_initial-language-pairs_source-lang-en.csv ________________
# Save the languages to CSV file to reuse later
Expand Down
80 changes: 80 additions & 0 deletions scripts/fn/datainfo_tmx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/python3
# ==============================================================================
#
# FILE: datainfo_tmx.py
#
# USAGE: ./scripts/fn/datainfo_tmx.py
#
# DESCRIPTION: Quick and hackish way to generate a CSV describing what
# each language pair of the first released TICO-19 TMXs
# means.
#
# OPTIONS: ---
#
# REQUIREMENTS: - python3
# BUGS: ---
# NOTES: ---
# AUTHORS: Emerson Rocha <rocha[at]ieee.org>
# COLLABORATORS: <@TODO: put additional non-anonymous names here>
# COMPANY: EticaAI
# LICENSE: Public Domain dedication OR Zero-Clause BSD
# SPDX-License-Identifier: Unlicense OR 0BSD
# VERSION: v1.0
# CREATED: 2021-11-20 03:26 UTC
# ==============================================================================

import sys

if len(sys.argv) < 2 or sys.argv[1] in ('-h', '--help'):
    # An explicit -h/--help is a normal request: usage on stdout, status 0.
    # A call without arguments is a usage error: report on stderr and exit
    # non-zero, so a caller that redirects stdout to a CSV file does not
    # silently store the usage text as data.
    _help_requested = len(sys.argv) >= 2
    _usage_stream = sys.stdout if _help_requested else sys.stderr

    # A space separates the program name from the argument placeholders
    # (it was missing, gluing both together).
    print('usage: ' + sys.argv[0] + ' xx-yy xx-YY_ZZ xx-JJ-LL',
          file=_usage_stream)
    print('example: ', file=_usage_stream)
    print(' ' + sys.argv[0] + ' en-pt-BR en-ti_ER en-es-LA',
          file=_usage_stream)

    sys.exit(0 if _help_requested else 2)

# Header row of the CSV. The column order is the same as the order of the
# values printed for each language pair by tico19_language_row().
line_items = [
    'TICO-19 language pair',
    'Source Language',
    'Source language BCP47',
    'Target language',
    'Target language BCP47',
    'Deterministic language pair',
]
print(','.join(line_items))


def fubar(lang):
    """Map the TICO-19 code 'es-LA' to 'es-419'.

    Any other value is returned unchanged.
    """
    return 'es-419' if lang == 'es-LA' else lang


def tico19_language_row(tico19_lang_convention):
    """Print one CSV row describing a TICO-19 language pair.

    Args:
        tico19_lang_convention: language pair such as 'en-pt-BR' or
            'en-ti_ER'. Only pairs whose source language is 'en' are
            implemented.

    Returns:
        True once the row has been printed to stdout.

    Raises:
        Exception: when the pair does not start with 'en-'.
    """
    source_prefix = 'en-'

    # Require 'en-' as a prefix. A substring test would also accept values
    # such as 'fr-en-GB' and print a wrong row for them.
    if not tico19_lang_convention.startswith(source_prefix):
        raise Exception('Not implemented + [' + tico19_lang_convention + ']')

    lang_part_original = tico19_lang_convention
    lang_part_source_original = 'en'
    lang_part_source_bc47 = 'en'

    # Drop only the leading 'en-'. str.replace() would also remove any later
    # 'en-' inside the target part.
    lang_part_target_original = lang_part_original[len(source_prefix):]

    # TICO-19 uses '_' before the region in some pairs (e.g. ti_ER); the
    # BCP47 column uses '-' instead, then fubar() applies its remapping.
    lang_part_target_bc47 = fubar(
        lang_part_target_original.replace('_', '-'))

    lang_pair_deterministic = \
        lang_part_source_bc47 + '_' + lang_part_target_bc47

    line_items = [
        lang_part_original,
        lang_part_source_original,
        lang_part_source_bc47,
        lang_part_target_original,
        lang_part_target_bc47,
        lang_pair_deterministic,
    ]
    print(','.join(line_items))
    return True


# Print one CSV row for each language pair given on the command line, in the
# order received, then end the program.
for lang_pair in sys.argv[1:]:
    tico19_language_row(lang_pair)


sys.exit()
Loading

0 comments on commit 6562df5

Please sign in to comment.