From 8c56ee817867358f2a8130372fd914c136bd7a5b Mon Sep 17 00:00:00 2001
From: Chaitanya Narisetty <cnariset@andrew.cmu.edu>
Date: Fri, 6 May 2022 08:59:26 -0400
Subject: [PATCH 1/5] bug fixes in ST recipes

* Change the sampling frequency from 8000 to 16000 in `fbank.conf` and `pitch.conf` of the CoVoST2 recipe
* In `run.sh`, use more speed-perturbation factors when the language is low-resource, and fix the `test_set` / `test_sets` variable-name typo
* In `st.sh`
  * Fix directory naming so that outputs for different language pairs no longer overwrite each other
  * Replace `>>` with `>` so that previous inference results are overwritten instead of appended to
  * Fix the removal of empty text in stage 4
  * When removing the utterance ID from `ref.trn.org` or `hyp.trn.org`, the previous implementation removed every parenthesized span instead of only the utterance ID at the end of each line. Fixed by changing `perl -pe 's/\([^\)]+\)//g;'` to `perl -pe 's/\([^\)]+\)$//g;'`
---
 egs2/TEMPLATE/st1/st.sh          | 90 +++++++++++++-------------------
 egs2/covost2/st1/conf/fbank.conf |  2 +-
 egs2/covost2/st1/conf/pitch.conf |  2 +-
 egs2/covost2/st1/run.sh          |  8 +--
 4 files changed, 43 insertions(+), 59 deletions(-)

diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh
index 18303210f87..e60eacf2cea 100755
--- a/egs2/TEMPLATE/st1/st.sh
+++ b/egs2/TEMPLATE/st1/st.sh
@@ -139,7 +139,6 @@ lm_test_text=    # Text file path of language model evaluation set.
 nlsyms_txt=none  # Non-linguistic symbol list if existing.
 cleaner=none     # Text cleaner.
 g2p=none         # g2p method (needed if token_type=phn).
-lang=noinfo      # The language type of corpus.
 score_opts=                # The options given to sclite scoring
 local_score_opts=          # The options given to local/score.sh.
 st_speech_fold_length=800 # fold_length for speech data during ST training.
@@ -250,7 +249,6 @@ Options:
     --nlsyms_txt    # Non-linguistic symbol list if existing (default="${nlsyms_txt}").
     --cleaner       # Text cleaner (default="${cleaner}").
     --g2p           # g2p method (default="${g2p}").
-    --lang          # The language type of corpus (default=${lang}).
     --score_opts             # The options given to sclite scoring (default="{score_opts}").
     --local_score_opts       # The options given to local/score.sh (default="{local_score_opts}").
     --st_speech_fold_length # fold_length for speech data during ST training (default="${st_speech_fold_length}").
@@ -307,11 +305,7 @@ utt_extra_files="text.${src_case}.${src_lang} text.${tgt_case}.${tgt_lang}"
 [ -z "${lm_test_text}" ] && lm_test_text="${data_feats}/${test_sets%% *}/text.${tgt_case}.${tgt_lang}"
 
 # Check tokenization type
-if [ "${lang}" != noinfo ]; then
-    token_listdir=data/${lang}_token_list
-else
-    token_listdir=data/token_list
-fi
+token_listdir=data/${src_lang}_${tgt_lang}_token_list
 # The tgt bpedir is set for all cases when using bpe
 tgt_bpedir="${token_listdir}/tgt_bpe_${tgt_bpemode}${tgt_nbpe}"
 tgt_bpeprefix="${tgt_bpedir}"/bpe
@@ -386,10 +380,7 @@ if [ -z "${st_tag}" ]; then
     else
         st_tag="train_${feats_type}"
     fi
-    if [ "${lang}" != noinfo ]; then
-        st_tag+="_${lang}_${tgt_token_type}_${tgt_case}"
-    else
-        st_tag+="_${tgt_token_type}_${tgt_case}"
+    st_tag+="_${src_lang}_${tgt_lang}_${tgt_token_type}_${tgt_case}"
     fi
     if [ "${tgt_token_type}" = bpe ]; then
         st_tag+="${tgt_nbpe}"
@@ -408,10 +399,7 @@ if [ -z "${lm_tag}" ]; then
     else
         lm_tag="train"
     fi
-    if [ "${lang}" != noinfo ]; then
-        lm_tag+="_${lang}_${lm_token_type}"
-    else
-        lm_tag+="_${lm_token_type}"
+    lm_tag+="_${tgt_lang}_${lm_token_type}"
     fi
     if [ "${lm_token_type}" = bpe ]; then
         lm_tag+="${tgt_nbpe}"
@@ -424,10 +412,7 @@ fi
 
 # The directory used for collect-stats mode
 if [ -z "${st_stats_dir}" ]; then
-    if [ "${lang}" != noinfo ]; then
-        st_stats_dir="${expdir}/st_stats_${feats_type}_${lang}_${tgt_token_type}"
-    else
-        st_stats_dir="${expdir}/st_stats_${feats_type}_${tgt_token_type}"
+    st_stats_dir="${expdir}/st_stats_${feats_type}_${src_lang}_${tgt_lang}_${tgt_token_type}"
     fi
     if [ "${tgt_token_type}" = bpe ]; then
         st_stats_dir+="${tgt_nbpe}"
@@ -437,10 +422,7 @@ if [ -z "${st_stats_dir}" ]; then
     fi
 fi
 if [ -z "${lm_stats_dir}" ]; then
-    if [ "${lang}" != noinfo ]; then
-        lm_stats_dir="${expdir}/lm_stats_${lang}_${lm_token_type}"
-    else
-        lm_stats_dir="${expdir}/lm_stats_${lm_token_type}"
+    lm_stats_dir="${expdir}/lm_stats_${tgt_lang}_${lm_token_type}"
     fi
     if [ "${lm_token_type}" = bpe ]; then
         lm_stats_dir+="${tgt_nbpe}"
@@ -540,7 +522,7 @@ if ! "${skip_data_prep}"; then
                 # expand the utt_extra_files for multi-references
                 expand_utt_extra_files=""
                 for extra_file in ${utt_extra_files}; do
-                    # with regex to suuport multi-references
+                    # with regex to support multi-references
                     for single_file in $(ls data/"${dset}"/${extra_file}*); do
                         cp ${single_file} "${data_feats}${_suf}/${dset}"
                         expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
@@ -585,7 +567,7 @@ if ! "${skip_data_prep}"; then
                 # expand the utt_extra_files for multi-references
                 expand_utt_extra_files=""
                 for extra_file in ${utt_extra_files}; do
-                    # with regex to suuport multi-references
+                    # with regex to support multi-references
                     for single_file in $(ls data/"${dset}"/${extra_file}*); do
                         cp ${single_file} "${data_feats}${_suf}/${dset}"
                         expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
@@ -634,7 +616,7 @@ if ! "${skip_data_prep}"; then
                 # expand the utt_extra_files for multi-references
                 expand_utt_extra_files=""
                 for extra_file in ${utt_extra_files}; do
-                    # with regex to suuport multi-references
+                    # with regex to support multi-references
                     for single_file in $(ls data/"${dset}"/${extra_file}*); do
                         cp ${single_file} "${data_feats}${_suf}/${dset}"
                         expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
@@ -717,8 +699,10 @@ if ! "${skip_data_prep}"; then
             fi
 
             # Remove empty text
-            <"${data_feats}/org/${dset}/text" \
-                awk ' { if( NF != 1 ) print $0; } ' >"${data_feats}/${dset}/text"
+            for utt_extra_file in ${utt_extra_files}; do
+                <"${data_feats}/org/${dset}/${utt_extra_file}" \
+                    awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/${dset}/${utt_extra_file}"
+            done
 
             # fix_data_dir.sh leaves only utts which exist in all files
             utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" "${data_feats}/${dset}"
@@ -730,7 +714,7 @@ if ! "${skip_data_prep}"; then
         done
 
         # shellcheck disable=SC2002
-        cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/lm_train.txt"
+        cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt"
     fi
 
     if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
@@ -803,10 +787,10 @@ if ! "${skip_data_prep}"; then
 
         # Create word-list for word-LM training
         if ${use_word_lm} && [ "${tgt_token_type}" != word ]; then
-            log "Generate word level token_list from ${data_feats}/lm_train.txt"
+            log "Generate word level token_list from ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt"
             ${python} -m espnet2.bin.tokenize_text \
                 --token_type word \
-                --input "${data_feats}/lm_train.txt" --output "${lm_token_list}" \
+                --input "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" --output "${lm_token_list}" \
                 --field 2- \
                 --cleaner "${cleaner}" \
                 --g2p "${g2p}" \
@@ -892,7 +876,7 @@ fi
 if ! "${skip_train}"; then
     if "${use_lm}"; then
         if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
-            log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.txt, dev_set=${lm_dev_text}"
+            log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}"
 
             _opts=
             if [ -n "${lm_config}" ]; then
@@ -905,9 +889,9 @@ if ! "${skip_train}"; then
             _logdir="${lm_stats_dir}/logdir"
             mkdir -p "${_logdir}"
             # Get the minimum number among ${nj} and the number lines of input files
-            _nj=$(min "${nj}" "$(<${data_feats}/lm_train.txt wc -l)" "$(<${lm_dev_text} wc -l)")
+            _nj=$(min "${nj}" "$(<${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt wc -l)" "$(<${lm_dev_text} wc -l)")
 
-            key_file="${data_feats}/lm_train.txt"
+            key_file="${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt"
             split_scps=""
             for n in $(seq ${_nj}); do
                 split_scps+=" ${_logdir}/train.${n}.scp"
@@ -942,7 +926,7 @@ if ! "${skip_train}"; then
                     --non_linguistic_symbols "${nlsyms_txt}" \
                     --cleaner "${cleaner}" \
                     --g2p "${g2p}" \
-                    --train_data_path_and_name_and_type "${data_feats}/lm_train.txt,text,text" \
+                    --train_data_path_and_name_and_type "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text" \
                     --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
                     --train_shape_file "${_logdir}/train.JOB.scp" \
                     --valid_shape_file "${_logdir}/dev.JOB.scp" \
@@ -969,7 +953,7 @@ if ! "${skip_train}"; then
 
 
         if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
-            log "Stage 7: LM Training: train_set=${data_feats}/lm_train.txt, dev_set=${lm_dev_text}"
+            log "Stage 7: LM Training: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}"
 
             _opts=
             if [ -n "${lm_config}" ]; then
@@ -987,7 +971,7 @@ if ! "${skip_train}"; then
                 if [ ! -f "${_split_dir}/.done" ]; then
                     rm -f "${_split_dir}/.done"
                     ${python} -m espnet2.bin.split_scps \
-                      --scps "${data_feats}/lm_train.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \
+                      --scps "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \
                       --num_splits "${num_splits_lm}" \
                       --output_dir "${_split_dir}"
                     touch "${_split_dir}/.done"
@@ -995,12 +979,12 @@ if ! "${skip_train}"; then
                     log "${_split_dir}/.done exists. Spliting is skipped"
                 fi
 
-                _opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.txt,text,text "
+                _opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text "
                 _opts+="--train_shape_file ${_split_dir}/text_shape.${lm_token_type} "
                 _opts+="--multiple_iterator true "
 
             else
-                _opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.txt,text,text "
+                _opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text "
                 _opts+="--train_shape_file ${lm_stats_dir}/train/text_shape.${lm_token_type} "
             fi
 
@@ -1073,8 +1057,8 @@ if ! "${skip_train}"; then
     fi
     if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
         if "${use_ngram}"; then
-            log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.txt"
-            cut -f 2 -d " " ${data_feats}/lm_train.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa
+            log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt"
+            cut -f 2 -d " " ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa
             build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin 
         else
             log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}"
@@ -1485,19 +1469,19 @@ if ! "${skip_eval}"; then
                     >"${_scoredir}/hyp.trn.org"
             
             # remove utterance id
-            perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn"
-            perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn"
+            perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn"
+            perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn"
 
             # detokenizer
             detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok"
             detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok"
 
             if [ ${tgt_case} = "tc" ]; then
-                echo "Case sensitive BLEU result (single-reference)" >> ${_scoredir}/result.tc.txt
+                echo "Case sensitive BLEU result (single-reference)" > ${_scoredir}/result.tc.txt
                 sacrebleu "${_scoredir}/ref.trn.detok" \
                           -i "${_scoredir}/hyp.trn.detok" \
                           -m bleu chrf ter \
-                          >> ${_scoredir}/result.tc.txt
+                          > ${_scoredir}/result.tc.txt
                 
                 log "Write a case-sensitive BLEU (single-reference) result in ${_scoredir}/result.tc.txt"
             fi
@@ -1505,11 +1489,11 @@ if ! "${skip_eval}"; then
             # detokenize & remove punctuation except apostrophe
             remove_punctuation.pl < "${_scoredir}/ref.trn.detok" > "${_scoredir}/ref.trn.detok.lc.rm"
             remove_punctuation.pl < "${_scoredir}/hyp.trn.detok" > "${_scoredir}/hyp.trn.detok.lc.rm"
-            echo "Case insensitive BLEU result (single-reference)" >> ${_scoredir}/result.lc.txt
+            echo "Case insensitive BLEU result (single-reference)" > ${_scoredir}/result.lc.txt
             sacrebleu -lc "${_scoredir}/ref.trn.detok.lc.rm" \
                       -i "${_scoredir}/hyp.trn.detok.lc.rm" \
                       -m bleu chrf ter \
-                      >> ${_scoredir}/result.lc.txt
+                      > ${_scoredir}/result.lc.txt
             log "Write a case-insensitve BLEU (single-reference) result in ${_scoredir}/result.lc.txt"
 
             # process multi-references cases
@@ -1532,7 +1516,7 @@ if ! "${skip_eval}"; then
                             >"${_scoredir}/ref.trn.org.${ref_idx}"
                     
                     # 
-                    perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}"
+                    perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}"
                     detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}"
                     remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}"
                     case_sensitive_refs="${case_sensitive_refs} ${_scoredir}/ref.trn.detok.${ref_idx}"
@@ -1540,17 +1524,17 @@ if ! "${skip_eval}"; then
                 done
 
                 if [ ${tgt_case} = "tc" ]; then
-                    echo "Case sensitive BLEU result (multi-references)" >> ${_scoredir}/result.tc.txt
+                    echo "Case sensitive BLEU result (multi-references)" > ${_scoredir}/result.tc.txt
                     sacrebleu ${case_sensitive_refs} \
                         -i ${_scoredir}/hyp.trn.detok.lc.rm -m bleu chrf ter \
-                        >> ${_scoredir}/result.tc.txt
+                        > ${_scoredir}/result.tc.txt
                     log "Write a case-sensitve BLEU (multi-reference) result in ${_scoredir}/result.tc.txt"
                 fi
 
-                echo "Case insensitive BLEU result (multi-references)" >> ${_scoredir}/result.lc.txt
+                echo "Case insensitive BLEU result (multi-references)" > ${_scoredir}/result.lc.txt
                 sacrebleu -lc ${case_insensitive_refs} \
                     -i ${_scoredir}/hyp.trn.detok.lc.rm -m bleu chrf ter \
-                    >> ${_scoredir}/result.lc.txt
+                    > ${_scoredir}/result.lc.txt
                 log "Write a case-insensitve BLEU (multi-reference) result in ${_scoredir}/result.lc.txt"
             fi
         done
@@ -1646,7 +1630,7 @@ EOF
         # shellcheck disable=SC2086
         espnet_model_zoo_upload \
             --file "${packed_model}" \
-            --title "ESPnet2 pretrained model, ${_model_name}, fs=${fs}, lang=${lang}" \
+            --title "ESPnet2 pretrained model, ${_model_name}, fs=${fs}, lang=${src_lang}_${tgt_lang}" \
             --description_file "${st_exp}"/description \
             --creator_name "${_creator_name}" \
             --license "CC-BY-4.0" \
diff --git a/egs2/covost2/st1/conf/fbank.conf b/egs2/covost2/st1/conf/fbank.conf
index d75ddde4df8..75232358639 100644
--- a/egs2/covost2/st1/conf/fbank.conf
+++ b/egs2/covost2/st1/conf/fbank.conf
@@ -1,2 +1,2 @@
---sample-frequency=8000 
+--sample-frequency=16000
 --num-mel-bins=80
diff --git a/egs2/covost2/st1/conf/pitch.conf b/egs2/covost2/st1/conf/pitch.conf
index 926bcfca92a..e959a19d5b8 100644
--- a/egs2/covost2/st1/conf/pitch.conf
+++ b/egs2/covost2/st1/conf/pitch.conf
@@ -1 +1 @@
---sample-frequency=8000
+--sample-frequency=16000
diff --git a/egs2/covost2/st1/run.sh b/egs2/covost2/st1/run.sh
index 5cd66dbaf53..778dc824185 100755
--- a/egs2/covost2/st1/run.sh
+++ b/egs2/covost2/st1/run.sh
@@ -38,7 +38,7 @@ tgt_case=lc.rm
 
 train_set=train.${src_lang}-${tgt_lang}
 train_dev=dev.${src_lang}-${tgt_lang}
-test_set="test.${src_lang}-${tgt_lang} dev.${src_lang}-${tgt_lang} "
+test_sets="test.${src_lang}-${tgt_lang} dev.${src_lang}-${tgt_lang}"
 
 st_config=conf/train_st.yaml
 inference_config=conf/decode_st.yaml
@@ -75,9 +75,9 @@ if [[ ${is_exist} == false ]]; then
 fi
 
 if [ ${is_low_resource} = true ]; then
-    speed_perturb_factors="0.9 1.0 1.1"
-else
     speed_perturb_factors="0.8 0.9 1.0 1.1 1.2"
+else
+    speed_perturb_factors="0.9 1.0 1.1"
 fi
 
 if [ ${src_lang} == ja ] || [ ${src_lang} == zh-CN ]; then
@@ -108,7 +108,7 @@ fi
     --inference_config "${inference_config}" \
     --train_set "${train_set}" \
     --valid_set "${train_dev}" \
-    --test_sets "${test_set}" \
+    --test_sets "${test_sets}" \
     --src_bpe_train_text "data/${train_set}/text.${src_case}.${src_lang}" \
     --tgt_bpe_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" \
     --lm_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}"  "$@"

From eb6dc2d55faac7e62742d0b7791d8f3a991e91d1 Mon Sep 17 00:00:00 2001
From: Chaitanya Narisetty <cnariset@andrew.cmu.edu>
Date: Fri, 6 May 2022 10:08:19 -0400
Subject: [PATCH 2/5] typo fix

---
 egs2/TEMPLATE/st1/st.sh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh
index e60eacf2cea..a43e6aef9a9 100755
--- a/egs2/TEMPLATE/st1/st.sh
+++ b/egs2/TEMPLATE/st1/st.sh
@@ -381,7 +381,6 @@ if [ -z "${st_tag}" ]; then
         st_tag="train_${feats_type}"
     fi
     st_tag+="_${src_lang}_${tgt_lang}_${tgt_token_type}_${tgt_case}"
-    fi
     if [ "${tgt_token_type}" = bpe ]; then
         st_tag+="${tgt_nbpe}"
     fi
@@ -400,7 +399,6 @@ if [ -z "${lm_tag}" ]; then
         lm_tag="train"
     fi
     lm_tag+="_${tgt_lang}_${lm_token_type}"
-    fi
     if [ "${lm_token_type}" = bpe ]; then
         lm_tag+="${tgt_nbpe}"
     fi
@@ -413,7 +411,6 @@ fi
 # The directory used for collect-stats mode
 if [ -z "${st_stats_dir}" ]; then
     st_stats_dir="${expdir}/st_stats_${feats_type}_${src_lang}_${tgt_lang}_${tgt_token_type}"
-    fi
     if [ "${tgt_token_type}" = bpe ]; then
         st_stats_dir+="${tgt_nbpe}"
     fi
@@ -423,7 +420,6 @@ if [ -z "${st_stats_dir}" ]; then
 fi
 if [ -z "${lm_stats_dir}" ]; then
     lm_stats_dir="${expdir}/lm_stats_${tgt_lang}_${lm_token_type}"
-    fi
     if [ "${lm_token_type}" = bpe ]; then
         lm_stats_dir+="${tgt_nbpe}"
     fi

From ea44663e8a24ebfcaa03f3bba149e561e970fdf3 Mon Sep 17 00:00:00 2001
From: Chaitanya Narisetty <cnariset@andrew.cmu.edu>
Date: Fri, 13 May 2022 04:43:18 -0400
Subject: [PATCH 3/5] review suggested changes

---
 .../asr1/pyscripts/utils/rotate_logfile.py    | 59 +++++++++++++++++++
 egs2/TEMPLATE/st1/st.sh                       | 18 ++++--
 2 files changed, 71 insertions(+), 6 deletions(-)
 create mode 100755 egs2/TEMPLATE/asr1/pyscripts/utils/rotate_logfile.py

diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/rotate_logfile.py b/egs2/TEMPLATE/asr1/pyscripts/utils/rotate_logfile.py
new file mode 100755
index 00000000000..e30c7a1e682
--- /dev/null
+++ b/egs2/TEMPLATE/asr1/pyscripts/utils/rotate_logfile.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+# Copyright 2022 Chaitanya Narisetty
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Rotate log-file."""
+
+import argparse
+from pathlib import Path
+import shutil
+
+
+def rotate(path, max_num_log_files=1000):
+    """Rotate a log-file by shifting it and its numbered copies up by one index.
+
+    Index 0 is `path` itself; index i > 0 is `<stem>.<i><suffix>` next to it.
+    Indices are visited from `max_num_log_files - 1` down to 0: the file at the
+    highest index is deleted, every other existing file is renamed to the next
+    index, and missing files are skipped.
+
+    Examples:
+        Before: /some/path/ holds logfile.txt, logfile.1.txt, logfile.2.txt
+        >>> rotate('/some/path/logfile.txt')
+        After: /some/path/ holds logfile.1.txt, logfile.2.txt, logfile.3.txt
+    """
+    for i in range(max_num_log_files - 1, -1, -1):  # highest index first
+        if i == 0:
+            p = Path(path)  # index 0 is the un-numbered file itself
+            pn = p.parent / (p.stem + ".1" + p.suffix)
+        else:
+            _p = Path(path)
+            p = _p.parent / (_p.stem + f".{i}" + _p.suffix)  # current copy
+            pn = _p.parent / (_p.stem + f".{i + 1}" + _p.suffix)  # its new name
+        # Going from high to low frees each target name before it is needed.
+        if p.exists():
+            if i == max_num_log_files - 1:
+                p.unlink()  # highest index: delete rather than rename
+            else:
+                shutil.move(p, pn)
+
+
+def main():  # CLI entry point: parse the arguments and rotate the given log-file
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "log_filepath", type=str, help="Path to log-file to be rotated."
+    )
+    parser.add_argument(
+        "--max-num-log-files",
+        type=int,
+        help="Maximum number of log-files to be kept.",
+        default=1000,
+    )
+    args = parser.parse_args()
+    # argparse exposes `--max-num-log-files` as `args.max_num_log_files`.
+    rotate(args.log_filepath, args.max_num_log_files)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh
index a43e6aef9a9..bc0f57e4173 100755
--- a/egs2/TEMPLATE/st1/st.sh
+++ b/egs2/TEMPLATE/st1/st.sh
@@ -1472,12 +1472,18 @@ if ! "${skip_eval}"; then
             detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok"
             detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok"
 
+            # rotate result files
+            if [ ${tgt_case} = "tc" ]; then
+                pyscripts/utils/rotate_logfile.py ${_scoredir}/result.tc.txt
+            fi
+            pyscripts/utils/rotate_logfile.py ${_scoredir}/result.lc.txt
+
             if [ ${tgt_case} = "tc" ]; then
                 echo "Case sensitive BLEU result (single-reference)" > ${_scoredir}/result.tc.txt
                 sacrebleu "${_scoredir}/ref.trn.detok" \
                           -i "${_scoredir}/hyp.trn.detok" \
                           -m bleu chrf ter \
-                          > ${_scoredir}/result.tc.txt
+                          >> ${_scoredir}/result.tc.txt
                 
                 log "Write a case-sensitive BLEU (single-reference) result in ${_scoredir}/result.tc.txt"
             fi
@@ -1489,7 +1495,7 @@ if ! "${skip_eval}"; then
             sacrebleu -lc "${_scoredir}/ref.trn.detok.lc.rm" \
                       -i "${_scoredir}/hyp.trn.detok.lc.rm" \
                       -m bleu chrf ter \
-                      > ${_scoredir}/result.lc.txt
+                      >> ${_scoredir}/result.lc.txt
             log "Write a case-insensitve BLEU (single-reference) result in ${_scoredir}/result.lc.txt"
 
             # process multi-references cases
@@ -1520,17 +1526,17 @@ if ! "${skip_eval}"; then
                 done
 
                 if [ ${tgt_case} = "tc" ]; then
-                    echo "Case sensitive BLEU result (multi-references)" > ${_scoredir}/result.tc.txt
+                    echo "Case sensitive BLEU result (multi-references)" >> ${_scoredir}/result.tc.txt
                     sacrebleu ${case_sensitive_refs} \
                         -i ${_scoredir}/hyp.trn.detok.lc.rm -m bleu chrf ter \
-                        > ${_scoredir}/result.tc.txt
+                        >> ${_scoredir}/result.tc.txt
                     log "Write a case-sensitve BLEU (multi-reference) result in ${_scoredir}/result.tc.txt"
                 fi
 
-                echo "Case insensitive BLEU result (multi-references)" > ${_scoredir}/result.lc.txt
+                echo "Case insensitive BLEU result (multi-references)" >> ${_scoredir}/result.lc.txt
                 sacrebleu -lc ${case_insensitive_refs} \
                     -i ${_scoredir}/hyp.trn.detok.lc.rm -m bleu chrf ter \
-                    > ${_scoredir}/result.lc.txt
+                    >> ${_scoredir}/result.lc.txt
                 log "Write a case-insensitve BLEU (multi-reference) result in ${_scoredir}/result.lc.txt"
             fi
         done

From 3cac7bb7f732a694f4b87007271d394a9ee3838e Mon Sep 17 00:00:00 2001
From: Chaitanya Narisetty <cnariset@andrew.cmu.edu>
Date: Fri, 13 May 2022 05:07:55 -0400
Subject: [PATCH 4/5] resolve conflicts and fix lm_train filenames

---
 egs2/TEMPLATE/st1/st.sh | 65 +++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 32 deletions(-)

diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh
index bc0f57e4173..03be329cecf 100755
--- a/egs2/TEMPLATE/st1/st.sh
+++ b/egs2/TEMPLATE/st1/st.sh
@@ -483,7 +483,7 @@ if ! "${skip_data_prep}"; then
             done
             utils/combine_data.sh --extra_files "${utt_extra_files}" "data/${train_set}_sp" ${_dirs}
             for extra_file in ${utt_extra_files}; do
-                python pyscripts/utils/remove_duplicate_keys.py data/"${train_set}_sp"/${extra_file} > data/"${train_set}_sp"/${extra_file}.tmp 
+                python pyscripts/utils/remove_duplicate_keys.py data/"${train_set}_sp"/${extra_file} > data/"${train_set}_sp"/${extra_file}.tmp
                 mv data/"${train_set}_sp"/${extra_file}.tmp data/"${train_set}_sp"/${extra_file}
             done 
         else
@@ -522,7 +522,7 @@ if ! "${skip_data_prep}"; then
                     for single_file in $(ls data/"${dset}"/${extra_file}*); do
                         cp ${single_file} "${data_feats}${_suf}/${dset}"
                         expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
-                    done 
+                    done
                 done
                 echo "${expand_utt_extra_files}"
                 utils/fix_data_dir.sh --utt_extra_files "${expand_utt_extra_files}" "${data_feats}${_suf}/${dset}"
@@ -567,7 +567,7 @@ if ! "${skip_data_prep}"; then
                     for single_file in $(ls data/"${dset}"/${extra_file}*); do
                         cp ${single_file} "${data_feats}${_suf}/${dset}"
                         expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
-                    done 
+                    done
                 done
                 for extra_file in ${expand_utt_extra_files}; do
                     LC_ALL=C sort -u -k1,1 "${data_feats}${_suf}/${dset}/${extra_file}" -o "${data_feats}${_suf}/${dset}/${extra_file}"
@@ -616,7 +616,7 @@ if ! "${skip_data_prep}"; then
                     for single_file in $(ls data/"${dset}"/${extra_file}*); do
                         cp ${single_file} "${data_feats}${_suf}/${dset}"
                         expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
-                    done 
+                    done
                 done
                 utils/fix_data_dir.sh --utt_extra_files "${expand_utt_extra_files}*" "${data_feats}${_suf}/${dset}"
                 for extra_file in ${expand_utt_extra_files}; do
@@ -706,11 +706,12 @@ if ! "${skip_data_prep}"; then
                 python pyscripts/utils/remove_duplicate_keys.py ${data_feats}/${dset}/${utt_extra_file} \
                     > ${data_feats}/${dset}/${utt_extra_file}.tmp 
                 mv ${data_feats}/${dset}/${utt_extra_file}.tmp ${data_feats}/${dset}/${utt_extra_file}
-            done 
+            done
         done
 
         # shellcheck disable=SC2002
-        cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt"
+        cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' \
+            > "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt"
     fi
 
     if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
@@ -783,10 +784,10 @@ if ! "${skip_data_prep}"; then
 
         # Create word-list for word-LM training
         if ${use_word_lm} && [ "${tgt_token_type}" != word ]; then
-            log "Generate word level token_list from ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt"
+            log "Generate word level token_list from ${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt"
             ${python} -m espnet2.bin.tokenize_text \
                 --token_type word \
-                --input "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" --output "${lm_token_list}" \
+                --input "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt" --output "${lm_token_list}" \
                 --field 2- \
                 --cleaner "${cleaner}" \
                 --g2p "${g2p}" \
@@ -872,7 +873,7 @@ fi
 if ! "${skip_train}"; then
     if "${use_lm}"; then
         if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
-            log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}"
+            log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}"
 
             _opts=
             if [ -n "${lm_config}" ]; then
@@ -885,9 +886,9 @@ if ! "${skip_train}"; then
             _logdir="${lm_stats_dir}/logdir"
             mkdir -p "${_logdir}"
             # Get the minimum number among ${nj} and the number lines of input files
-            _nj=$(min "${nj}" "$(<${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt wc -l)" "$(<${lm_dev_text} wc -l)")
+            _nj=$(min "${nj}" "$(<${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt wc -l)" "$(<${lm_dev_text} wc -l)")
 
-            key_file="${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt"
+            key_file="${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt"
             split_scps=""
             for n in $(seq ${_nj}); do
                 split_scps+=" ${_logdir}/train.${n}.scp"
@@ -911,7 +912,7 @@ if ! "${skip_train}"; then
             log "LM collect-stats started... log: '${_logdir}/stats.*.log'"
             # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted,
             #       but it's used only for deciding the sample ids.
-            # shellcheck disable=SC2086
+            # shellcheck disable=SC2046,SC2086
             ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
                 ${python} -m espnet2.bin.lm_train \
                     --collect_stats true \
@@ -922,12 +923,12 @@ if ! "${skip_train}"; then
                     --non_linguistic_symbols "${nlsyms_txt}" \
                     --cleaner "${cleaner}" \
                     --g2p "${g2p}" \
-                    --train_data_path_and_name_and_type "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text" \
+                    --train_data_path_and_name_and_type "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt,text,text" \
                     --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
                     --train_shape_file "${_logdir}/train.JOB.scp" \
                     --valid_shape_file "${_logdir}/dev.JOB.scp" \
                     --output_dir "${_logdir}/stats.JOB" \
-                    ${_opts} ${lm_args} || { cat "${_logdir}"/stats.1.log; exit 1; }
+                    ${_opts} ${lm_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) </dev/null ; exit 1; }
 
             # 4. Aggregate shape files
             _opts=
@@ -949,7 +950,7 @@ if ! "${skip_train}"; then
 
 
         if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
-            log "Stage 7: LM Training: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}"
+            log "Stage 7: LM Training: train_set=${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}"
 
             _opts=
             if [ -n "${lm_config}" ]; then
@@ -967,7 +968,7 @@ if ! "${skip_train}"; then
                 if [ ! -f "${_split_dir}/.done" ]; then
                     rm -f "${_split_dir}/.done"
                     ${python} -m espnet2.bin.split_scps \
-                      --scps "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \
+                      --scps "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \
                       --num_splits "${num_splits_lm}" \
                       --output_dir "${_split_dir}"
                     touch "${_split_dir}/.done"
@@ -975,12 +976,12 @@ if ! "${skip_train}"; then
                     log "${_split_dir}/.done exists. Spliting is skipped"
                 fi
 
-                _opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text "
+                _opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt,text,text "
                 _opts+="--train_shape_file ${_split_dir}/text_shape.${lm_token_type} "
                 _opts+="--multiple_iterator true "
 
             else
-                _opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text "
+                _opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt,text,text "
                 _opts+="--train_shape_file ${lm_stats_dir}/train/text_shape.${lm_token_type} "
             fi
 
@@ -1053,9 +1054,9 @@ if ! "${skip_train}"; then
     fi
     if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
         if "${use_ngram}"; then
-            log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt"
-            cut -f 2 -d " " ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa
-            build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin 
+            log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt"
+            cut -f 2- -d " " ${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa
+            build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin
         else
             log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}"
         fi
@@ -1407,7 +1408,7 @@ if ! "${skip_eval}"; then
 
             # 2. Submit decoding jobs
             log "Decoding started... log: '${_logdir}/st_inference.*.log'"
-            # shellcheck disable=SC2086
+            # shellcheck disable=SC2046,SC2086
             ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/st_inference.JOB.log \
                 ${python} -m ${st_inference_tool} \
                     --batch_size ${batch_size} \
@@ -1417,7 +1418,7 @@ if ! "${skip_eval}"; then
                     --st_train_config "${st_exp}"/config.yaml \
                     --st_model_file "${st_exp}"/"${inference_st_model}" \
                     --output_dir "${_logdir}"/output.JOB \
-                    ${_opts} ${inference_args}
+                    ${_opts} ${inference_args} || { cat $(grep -l -i error "${_logdir}"/st_inference.*.log) </dev/null ; exit 1; }
 
             # 3. Concatenates the output files from each jobs
             for f in token token_int score text; do
@@ -1463,7 +1464,7 @@ if ! "${skip_eval}"; then
                             ) \
                 <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
                     >"${_scoredir}/hyp.trn.org"
-            
+
             # remove utterance id
             perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn"
             perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn"
@@ -1484,7 +1485,7 @@ if ! "${skip_eval}"; then
                           -i "${_scoredir}/hyp.trn.detok" \
                           -m bleu chrf ter \
                           >> ${_scoredir}/result.tc.txt
-                
+
                 log "Write a case-sensitive BLEU (single-reference) result in ${_scoredir}/result.tc.txt"
             fi
 
@@ -1516,8 +1517,8 @@ if ! "${skip_eval}"; then
                                 ) \
                         <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
                             >"${_scoredir}/ref.trn.org.${ref_idx}"
-                    
-                    # 
+
+                    # remove utterance id
                     perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}"
                     detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}"
                     remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}"
@@ -1653,11 +1654,11 @@ if ! "${skip_upload_hf}"; then
         gitlfs=$(git lfs --version 2> /dev/null || true)
         [ -z "${gitlfs}" ] && \
             log "ERROR: You need to install git-lfs first" && \
-            exit 1             
-  
+            exit 1
+
         dir_repo=${expdir}/hf_${hf_repo//"/"/"_"}
         [ ! -d "${dir_repo}" ] && git clone https://huggingface.co/${hf_repo} ${dir_repo}
-  
+
         if command -v git &> /dev/null; then
             _creator_name="$(git config user.name)"
             _checkout="git checkout $(git show -s --format=%H)"
@@ -1670,13 +1671,13 @@ if ! "${skip_upload_hf}"; then
         # foo/asr1 -> foo
         _corpus="${_task%/*}"
         _model_name="${_creator_name}/${_corpus}_$(basename ${packed_model} .zip)"
-  
+
         # copy files in ${dir_repo}
         unzip -o ${packed_model} -d ${dir_repo}
         # Generate description file
         # shellcheck disable=SC2034
         hf_task=speech-translation
-        # shellcheck disable=SC2034     
+        # shellcheck disable=SC2034
         espnet_task=ST
         # shellcheck disable=SC2034
         task_exp=${st_exp}

From aa5d6ffff67079f2cbe6a7e1eba852e459f0f6a4 Mon Sep 17 00:00:00 2001
From: Chaitanya Narisetty <cnariset@andrew.cmu.edu>
Date: Fri, 13 May 2022 05:15:32 -0400
Subject: [PATCH 5/5] fix lm tag names

---
 egs2/TEMPLATE/st1/st.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh
index 03be329cecf..b37cd3c5f22 100755
--- a/egs2/TEMPLATE/st1/st.sh
+++ b/egs2/TEMPLATE/st1/st.sh
@@ -398,7 +398,7 @@ if [ -z "${lm_tag}" ]; then
     else
         lm_tag="train"
     fi
-    lm_tag+="_${tgt_lang}_${lm_token_type}"
+    lm_tag+="_${src_lang}_${tgt_lang}_${lm_token_type}"
     if [ "${lm_token_type}" = bpe ]; then
         lm_tag+="${tgt_nbpe}"
     fi
@@ -419,7 +419,7 @@ if [ -z "${st_stats_dir}" ]; then
     fi
 fi
 if [ -z "${lm_stats_dir}" ]; then
-    lm_stats_dir="${expdir}/lm_stats_${tgt_lang}_${lm_token_type}"
+    lm_stats_dir="${expdir}/lm_stats_${src_lang}_${tgt_lang}_${lm_token_type}"
     if [ "${lm_token_type}" = bpe ]; then
         lm_stats_dir+="${tgt_nbpe}"
     fi
@@ -485,7 +485,7 @@ if ! "${skip_data_prep}"; then
             for extra_file in ${utt_extra_files}; do
                 python pyscripts/utils/remove_duplicate_keys.py data/"${train_set}_sp"/${extra_file} > data/"${train_set}_sp"/${extra_file}.tmp
                 mv data/"${train_set}_sp"/${extra_file}.tmp data/"${train_set}_sp"/${extra_file}
-            done 
+            done
         else
            log "Skip stage 2: Speed perturbation"
         fi
@@ -704,7 +704,7 @@ if ! "${skip_data_prep}"; then
             utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" "${data_feats}/${dset}"
             for utt_extra_file in ${utt_extra_files}; do
                 python pyscripts/utils/remove_duplicate_keys.py ${data_feats}/${dset}/${utt_extra_file} \
-                    > ${data_feats}/${dset}/${utt_extra_file}.tmp 
+                    > ${data_feats}/${dset}/${utt_extra_file}.tmp
                 mv ${data_feats}/${dset}/${utt_extra_file}.tmp ${data_feats}/${dset}/${utt_extra_file}
             done
         done