From 3ae0dcc9442981274a98ab4cd375e381e4e2b491 Mon Sep 17 00:00:00 2001 From: Yifan Peng Date: Mon, 7 Mar 2022 00:17:55 -0500 Subject: [PATCH 1/4] fix joint token --- egs2/TEMPLATE/st1/st.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh index 93ffe4d3cf5..94f18eb219d 100755 --- a/egs2/TEMPLATE/st1/st.sh +++ b/egs2/TEMPLATE/st1/st.sh @@ -743,6 +743,16 @@ if ! "${skip_data_prep}"; then fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # Combine source and target texts when using joint tokenization + if "${token_joint}"; then + log "Merge src and target data if joint BPE" + + cat $tgt_bpe_train_text > ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang} + [ ! -z "${src_bpe_train_text}" ] && cat ${src_bpe_train_text} >> ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang} + # Set the new text as the target text + tgt_bpe_train_text="${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}" + fi + # First generate tgt lang if [ "${tgt_token_type}" = bpe ]; then log "Stage 5a: Generate token_list from ${tgt_bpe_train_text} using BPE for tgt_lang" From 9691479dd83fdc52dc33111939389e1300a65211 Mon Sep 17 00:00:00 2001 From: Siddharth Dalmia Date: Mon, 7 Mar 2022 08:14:11 -0500 Subject: [PATCH 2/4] scoring fixes MT and ST --- egs2/TEMPLATE/mt1/mt.sh | 69 +++++++++++++++++++++++++---------------- egs2/TEMPLATE/st1/st.sh | 8 ++--- 2 files changed, 47 insertions(+), 30 deletions(-) diff --git a/egs2/TEMPLATE/mt1/mt.sh b/egs2/TEMPLATE/mt1/mt.sh index 6164c155558..35c6ab276c3 100755 --- a/egs2/TEMPLATE/mt1/mt.sh +++ b/egs2/TEMPLATE/mt1/mt.sh @@ -1165,37 +1165,54 @@ if ! "${skip_eval}"; then _scoredir="${_dir}/score_bleu" mkdir -p "${_scoredir}" - paste \ - <(<"${_data}/text.${tgt_case}.${tgt_lang}" \ - ${python} -m espnet2.bin.tokenize_text \ - -f 2- --input - --output - \ - --token_type word \ - --non_linguistic_symbols "${nlsyms_txt}" \ - --remove_non_linguistic_symbols true \ - --cleaner "${cleaner}" \ - ) \ - <(<"${_data}/text.${tgt_case}.${tgt_lang}" awk '{ print "(" $2 "-" $1 ")" }') \ - >"${_scoredir}/ref.trn.org" + <"${_data}/text.${tgt_case}.${tgt_lang}" \ + ${python} -m espnet2.bin.tokenize_text \ + -f 2- --input - --output - \ + --token_type word \ + --non_linguistic_symbols "${nlsyms_txt}" \ + --remove_non_linguistic_symbols true \ + --cleaner "${cleaner}" \ + >"${_scoredir}/ref.trn" + + #paste \ + # <(<"${_data}/text.${tgt_case}.${tgt_lang}" \ + # ${python} -m espnet2.bin.tokenize_text \ + # -f 2- --input - --output - \ + # --token_type word \ + # --non_linguistic_symbols "${nlsyms_txt}" \ + # --remove_non_linguistic_symbols true \ + # --cleaner "${cleaner}" \ + # ) \ + # <(<"${_data}/text.${tgt_case}.${tgt_lang}" awk '{ print "(" $2 "-" $1 ")" }') \ + # >"${_scoredir}/ref.trn.org" # NOTE(kamo): Don't use cleaner for hyp - paste \ - <(<"${_dir}/text" \ - ${python} -m espnet2.bin.tokenize_text \ - -f 2- --input - --output - \ - --token_type word \ - --non_linguistic_symbols "${nlsyms_txt}" \ - --remove_non_linguistic_symbols true \ - ) \ - <(<"${_data}/text.${tgt_case}.${tgt_lang}" awk '{ print "(" $2 "-" $1 ")" }') \ - >"${_scoredir}/hyp.trn.org" + <"${_dir}/text" \ + ${python} -m espnet2.bin.tokenize_text \ + -f 2- --input - --output - \ + --token_type word \ + --non_linguistic_symbols "${nlsyms_txt}" \ + --remove_non_linguistic_symbols true \ + >"${_scoredir}/hyp.trn" + + #paste \ + # <(<"${_dir}/text" \ + # ${python} -m espnet2.bin.tokenize_text \ + # -f 2- --input - --output - \ + # --token_type word \ + # --non_linguistic_symbols "${nlsyms_txt}" \ + # --remove_non_linguistic_symbols true \ + # ) \ + # <(<"${_data}/text.${tgt_case}.${tgt_lang}" awk '{ print "(" $2 "-" $1 ")" }') \ + # >"${_scoredir}/hyp.trn.org" # remove utterance id - perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn" - perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn" + #perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn" + #perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn" # detokenizer - detokenizer.perl -l en -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok" - detokenizer.perl -l en -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok" + detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok" + detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok" if [ ${tgt_case} = "tc" ]; then echo "Case sensitive BLEU result (single-reference)" >> ${_scoredir}/result.tc.txt @@ -1238,7 +1255,7 @@ if ! "${skip_eval}"; then # perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}" - detokenizer.perl -l en -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}" + detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}" remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}" case_sensitive_refs="${case_sensitive_refs} ${_scoredir}/ref.trn.detok.${ref_idx}" case_insensitive_refs="${case_insensitive_refs} ${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}" diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh index 93ffe4d3cf5..696e956a1b7 100755 --- a/egs2/TEMPLATE/st1/st.sh +++ b/egs2/TEMPLATE/st1/st.sh @@ -1484,8 +1484,8 @@ if ! "${skip_eval}"; then perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn" # detokenizer - detokenizer.perl -l en -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok" - detokenizer.perl -l en -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok" + detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok" + detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok" if [ ${tgt_case} = "tc" ]; then echo "Case sensitive BLEU result (single-reference)" >> ${_scoredir}/result.tc.txt @@ -1528,7 +1528,7 @@ if ! "${skip_eval}"; then # perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}" - detokenizer.perl -l en -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}" + detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}" remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}" case_sensitive_refs="${case_sensitive_refs} ${_scoredir}/ref.trn.detok.${ref_idx}" case_insensitive_refs="${case_insensitive_refs} ${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}" @@ -1551,7 +1551,7 @@ if ! "${skip_eval}"; then done # Show results in Markdown syntax - scripts/utils/show_st_result.sh --case $tgt_case "${st_exp}" > "${st_exp}"/RESULTS.md + scripts/utils/show_translation_result.sh --case $tgt_case "${st_exp}" > "${st_exp}"/RESULTS.md cat "${cat_exp}"/RESULTS.md fi else From 041e132a2729ea9e59beed0970f17ccf5834e5ff Mon Sep 17 00:00:00 2001 From: Siddharth Dalmia Date: Mon, 7 Mar 2022 11:23:45 -0500 Subject: [PATCH 3/4] fix sos eos --- espnet2/bin/mt_inference.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/espnet2/bin/mt_inference.py b/espnet2/bin/mt_inference.py index 6ce966679a3..e523e1e6d47 100755 --- a/espnet2/bin/mt_inference.py +++ b/espnet2/bin/mt_inference.py @@ -213,7 +213,11 @@ def __call__( assert isinstance(hyp, Hypothesis), type(hyp) # remove sos/eos and get results - token_int = hyp.yseq[1:-1].tolist() + # token_int = hyp.yseq[1:-1].tolist() + # TODO(sdalmia): check why the above line doesn't work + token_int = hyp.yseq.tolist() + token_int = list(filter(lambda x: x != self.mt_model.sos, token_int)) + token_int = list(filter(lambda x: x != self.mt_model.eos, token_int)) # remove blank symbol id, which is assumed to be 0 token_int = list(filter(lambda x: x != 0, token_int)) From e2489b1054eb6fd9554ff2f20d62e91b35f1514e Mon Sep 17 00:00:00 2001 From: Yifan Peng Date: Mon, 7 Mar 2022 19:06:30 -0500 Subject: [PATCH 4/4] fix token joint in st.sh --- egs2/TEMPLATE/st1/st.sh | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh index 94f18eb219d..ddb5d1742a1 100755 --- a/egs2/TEMPLATE/st1/st.sh +++ b/egs2/TEMPLATE/st1/st.sh @@ -296,18 +296,8 @@ fi # Extra files for translation process utt_extra_files="text.${src_case}.${src_lang} text.${tgt_case}.${tgt_lang}" # Use the same text as ST for bpe training if not specified. -if "${token_joint}"; then - # if token_joint, the bpe training will use both src_lang and tgt_lang to train a single bpe model - [ -z "${src_bpe_train_text}" ] && src_bpe_train_text="${data_feats}/${train_set}/text.${src_case}.${src_lang}" - [ -z "${tgt_bpe_train_text}" ] && tgt_bpe_train_text="${data_feats}/${train_set}/text.${tgt_case}.${tgt_lang}" - - # Prepare data as text.${src_lang}_${tgt_lang}) - cat $src_bpe_train_text $tgt_bpe_train_text > ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang} - tgt_bpe_train_text="${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}" -else - [ -z "${src_bpe_train_text}" ] && src_bpe_train_text="${data_feats}/${train_set}/text.${src_case}.${src_lang}" - [ -z "${tgt_bpe_train_text}" ] && tgt_bpe_train_text="${data_feats}/${train_set}/text.${tgt_case}.${tgt_lang}" -fi +[ -z "${src_bpe_train_text}" ] && src_bpe_train_text="${data_feats}/${train_set}/text.${src_case}.${src_lang}" +[ -z "${tgt_bpe_train_text}" ] && tgt_bpe_train_text="${data_feats}/${train_set}/text.${tgt_case}.${tgt_lang}" # Use the same text as ST for lm training if not specified. [ -z "${lm_train_text}" ] && lm_train_text="${data_feats}/${train_set}/text.${tgt_case}.${tgt_lang}" # Use the same text as ST for lm training if not specified.