From 8c56ee817867358f2a8130372fd914c136bd7a5b Mon Sep 17 00:00:00 2001 From: Chaitanya Narisetty Date: Fri, 6 May 2022 08:59:26 -0400 Subject: [PATCH 1/5] bug fixes in ST recipes * Change sampling frequency in `fbank.conf` and `pitch.conf` in Covost2 recipe * In `run.sh`, if language is low resource, then have more speed perturbations. Fix typos for test sets * In `st.sh` * fix directory naming issues to avoid replacement for different language pairs * Replace `>>` with `>` to replace previous inference results * Fix removing of empty text in stage 4 * When removing utterance-ID in `ref.trn.org` or `hyp.trn.org`, the current implementation removes all words in parenthesis instead of removing just the utterance-ID from the end of each line. Fixed this by changing `perl -pe 's/\([^\)]+\)//g;'` to `perl -pe 's/\([^\)]+\)$//g;'` --- egs2/TEMPLATE/st1/st.sh | 90 +++++++++++++------------------- egs2/covost2/st1/conf/fbank.conf | 2 +- egs2/covost2/st1/conf/pitch.conf | 2 +- egs2/covost2/st1/run.sh | 8 +-- 4 files changed, 43 insertions(+), 59 deletions(-) diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh index 18303210f87..e60eacf2cea 100755 --- a/egs2/TEMPLATE/st1/st.sh +++ b/egs2/TEMPLATE/st1/st.sh @@ -139,7 +139,6 @@ lm_test_text= # Text file path of language model evaluation set. nlsyms_txt=none # Non-linguistic symbol list if existing. cleaner=none # Text cleaner. g2p=none # g2p method (needed if token_type=phn). -lang=noinfo # The language type of corpus. score_opts= # The options given to sclite scoring local_score_opts= # The options given to local/score.sh. st_speech_fold_length=800 # fold_length for speech data during ST training. @@ -250,7 +249,6 @@ Options: --nlsyms_txt # Non-linguistic symbol list if existing (default="${nlsyms_txt}"). --cleaner # Text cleaner (default="${cleaner}"). --g2p # g2p method (default="${g2p}"). - --lang # The language type of corpus (default=${lang}). --score_opts # The options given to sclite scoring (default="{score_opts}"). --local_score_opts # The options given to local/score.sh (default="{local_score_opts}"). --st_speech_fold_length # fold_length for speech data during ST training (default="${st_speech_fold_length}"). @@ -307,11 +305,7 @@ utt_extra_files="text.${src_case}.${src_lang} text.${tgt_case}.${tgt_lang}" [ -z "${lm_test_text}" ] && lm_test_text="${data_feats}/${test_sets%% *}/text.${tgt_case}.${tgt_lang}" # Check tokenization type -if [ "${lang}" != noinfo ]; then - token_listdir=data/${lang}_token_list -else - token_listdir=data/token_list -fi +token_listdir=data/${src_lang}_${tgt_lang}_token_list # The tgt bpedir is set for all cases when using bpe tgt_bpedir="${token_listdir}/tgt_bpe_${tgt_bpemode}${tgt_nbpe}" tgt_bpeprefix="${tgt_bpedir}"/bpe @@ -386,10 +380,7 @@ if [ -z "${st_tag}" ]; then else st_tag="train_${feats_type}" fi - if [ "${lang}" != noinfo ]; then - st_tag+="_${lang}_${tgt_token_type}_${tgt_case}" - else - st_tag+="_${tgt_token_type}_${tgt_case}" + st_tag+="_${src_lang}_${tgt_lang}_${tgt_token_type}_${tgt_case}" fi if [ "${tgt_token_type}" = bpe ]; then st_tag+="${tgt_nbpe}" @@ -408,10 +399,7 @@ if [ -z "${lm_tag}" ]; then else lm_tag="train" fi - if [ "${lang}" != noinfo ]; then - lm_tag+="_${lang}_${lm_token_type}" - else - lm_tag+="_${lm_token_type}" + lm_tag+="_${tgt_lang}_${lm_token_type}" fi if [ "${lm_token_type}" = bpe ]; then lm_tag+="${tgt_nbpe}" @@ -424,10 +412,7 @@ fi # The directory used for collect-stats mode if [ -z "${st_stats_dir}" ]; then - if [ "${lang}" != noinfo ]; then - st_stats_dir="${expdir}/st_stats_${feats_type}_${lang}_${tgt_token_type}" - else - st_stats_dir="${expdir}/st_stats_${feats_type}_${tgt_token_type}" + st_stats_dir="${expdir}/st_stats_${feats_type}_${src_lang}_${tgt_lang}_${tgt_token_type}" fi if [ "${tgt_token_type}" = bpe ]; then st_stats_dir+="${tgt_nbpe}" @@ -437,10 +422,7 @@ if [ -z "${st_stats_dir}" ]; then fi fi if [ -z "${lm_stats_dir}" ]; then - if [ "${lang}" != noinfo ]; then - lm_stats_dir="${expdir}/lm_stats_${lang}_${lm_token_type}" - else - lm_stats_dir="${expdir}/lm_stats_${lm_token_type}" + lm_stats_dir="${expdir}/lm_stats_${tgt_lang}_${lm_token_type}" fi if [ "${lm_token_type}" = bpe ]; then lm_stats_dir+="${tgt_nbpe}" @@ -540,7 +522,7 @@ if ! "${skip_data_prep}"; then # expand the utt_extra_files for multi-references expand_utt_extra_files="" for extra_file in ${utt_extra_files}; do - # with regex to suuport multi-references + # with regex to support multi-references for single_file in $(ls data/"${dset}"/${extra_file}*); do cp ${single_file} "${data_feats}${_suf}/${dset}" expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})" @@ -585,7 +567,7 @@ if ! "${skip_data_prep}"; then # expand the utt_extra_files for multi-references expand_utt_extra_files="" for extra_file in ${utt_extra_files}; do - # with regex to suuport multi-references + # with regex to support multi-references for single_file in $(ls data/"${dset}"/${extra_file}*); do cp ${single_file} "${data_feats}${_suf}/${dset}" expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})" @@ -634,7 +616,7 @@ if ! "${skip_data_prep}"; then # expand the utt_extra_files for multi-references expand_utt_extra_files="" for extra_file in ${utt_extra_files}; do - # with regex to suuport multi-references + # with regex to support multi-references for single_file in $(ls data/"${dset}"/${extra_file}*); do cp ${single_file} "${data_feats}${_suf}/${dset}" expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})" @@ -717,8 +699,10 @@ if ! "${skip_data_prep}"; then fi # Remove empty text - <"${data_feats}/org/${dset}/text" \ - awk ' { if( NF != 1 ) print $0; } ' >"${data_feats}/${dset}/text" + for utt_extra_file in ${utt_extra_files}; do + <"${data_feats}/org/${dset}/${utt_extra_file}" \ + awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/${dset}/${utt_extra_file}" + done # fix_data_dir.sh leaves only utts which exist in all files utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" "${data_feats}/${dset}" @@ -730,7 +714,7 @@ if ! "${skip_data_prep}"; then done # shellcheck disable=SC2002 - cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/lm_train.txt" + cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then @@ -803,10 +787,10 @@ if ! "${skip_data_prep}"; then # Create word-list for word-LM training if ${use_word_lm} && [ "${tgt_token_type}" != word ]; then - log "Generate word level token_list from ${data_feats}/lm_train.txt" + log "Generate word level token_list from ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" ${python} -m espnet2.bin.tokenize_text \ --token_type word \ - --input "${data_feats}/lm_train.txt" --output "${lm_token_list}" \ + --input "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" --output "${lm_token_list}" \ --field 2- \ --cleaner "${cleaner}" \ --g2p "${g2p}" \ @@ -892,7 +876,7 @@ fi if ! "${skip_train}"; then if "${use_lm}"; then if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.txt, dev_set=${lm_dev_text}" + log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}" _opts= if [ -n "${lm_config}" ]; then @@ -905,9 +889,9 @@ if ! "${skip_train}"; then _logdir="${lm_stats_dir}/logdir" mkdir -p "${_logdir}" # Get the minimum number among ${nj} and the number lines of input files - _nj=$(min "${nj}" "$(<${data_feats}/lm_train.txt wc -l)" "$(<${lm_dev_text} wc -l)") + _nj=$(min "${nj}" "$(<${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt wc -l)" "$(<${lm_dev_text} wc -l)") - key_file="${data_feats}/lm_train.txt" + key_file="${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" split_scps="" for n in $(seq ${_nj}); do split_scps+=" ${_logdir}/train.${n}.scp" @@ -942,7 +926,7 @@ if ! "${skip_train}"; then --non_linguistic_symbols "${nlsyms_txt}" \ --cleaner "${cleaner}" \ --g2p "${g2p}" \ - --train_data_path_and_name_and_type "${data_feats}/lm_train.txt,text,text" \ + --train_data_path_and_name_and_type "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text" \ --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \ --train_shape_file "${_logdir}/train.JOB.scp" \ --valid_shape_file "${_logdir}/dev.JOB.scp" \ @@ -969,7 +953,7 @@ if ! "${skip_train}"; then if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - log "Stage 7: LM Training: train_set=${data_feats}/lm_train.txt, dev_set=${lm_dev_text}" + log "Stage 7: LM Training: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}" _opts= if [ -n "${lm_config}" ]; then @@ -987,7 +971,7 @@ if ! "${skip_train}"; then if [ ! -f "${_split_dir}/.done" ]; then rm -f "${_split_dir}/.done" ${python} -m espnet2.bin.split_scps \ - --scps "${data_feats}/lm_train.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \ + --scps "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \ --num_splits "${num_splits_lm}" \ --output_dir "${_split_dir}" touch "${_split_dir}/.done" @@ -995,12 +979,12 @@ if ! "${skip_train}"; then log "${_split_dir}/.done exists. Spliting is skipped" fi - _opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.txt,text,text " + _opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text " _opts+="--train_shape_file ${_split_dir}/text_shape.${lm_token_type} " _opts+="--multiple_iterator true " else - _opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.txt,text,text " + _opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text " _opts+="--train_shape_file ${lm_stats_dir}/train/text_shape.${lm_token_type} " fi @@ -1073,8 +1057,8 @@ if ! "${skip_train}"; then fi if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then if "${use_ngram}"; then - log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.txt" - cut -f 2 -d " " ${data_feats}/lm_train.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa + log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" + cut -f 2 -d " " ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin else log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}" @@ -1485,19 +1469,19 @@ if ! "${skip_eval}"; then >"${_scoredir}/hyp.trn.org" # remove utterance id - perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn" - perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn" + perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn" + perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn" # detokenizer detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok" detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok" if [ ${tgt_case} = "tc" ]; then - echo "Case sensitive BLEU result (single-reference)" >> ${_scoredir}/result.tc.txt + echo "Case sensitive BLEU result (single-reference)" > ${_scoredir}/result.tc.txt sacrebleu "${_scoredir}/ref.trn.detok" \ -i "${_scoredir}/hyp.trn.detok" \ -m bleu chrf ter \ - >> ${_scoredir}/result.tc.txt + > ${_scoredir}/result.tc.txt log "Write a case-sensitive BLEU (single-reference) result in ${_scoredir}/result.tc.txt" fi @@ -1505,11 +1489,11 @@ if ! "${skip_eval}"; then # detokenize & remove punctuation except apostrophe remove_punctuation.pl < "${_scoredir}/ref.trn.detok" > "${_scoredir}/ref.trn.detok.lc.rm" remove_punctuation.pl < "${_scoredir}/hyp.trn.detok" > "${_scoredir}/hyp.trn.detok.lc.rm" - echo "Case insensitive BLEU result (single-reference)" >> ${_scoredir}/result.lc.txt + echo "Case insensitive BLEU result (single-reference)" > ${_scoredir}/result.lc.txt sacrebleu -lc "${_scoredir}/ref.trn.detok.lc.rm" \ -i "${_scoredir}/hyp.trn.detok.lc.rm" \ -m bleu chrf ter \ - >> ${_scoredir}/result.lc.txt + > ${_scoredir}/result.lc.txt log "Write a case-insensitve BLEU (single-reference) result in ${_scoredir}/result.lc.txt" # process multi-references cases @@ -1532,7 +1516,7 @@ if ! "${skip_eval}"; then >"${_scoredir}/ref.trn.org.${ref_idx}" # - perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}" + perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}" detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}" remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}" case_sensitive_refs="${case_sensitive_refs} ${_scoredir}/ref.trn.detok.${ref_idx}" @@ -1540,17 +1524,17 @@ if ! "${skip_eval}"; then done if [ ${tgt_case} = "tc" ]; then - echo "Case sensitive BLEU result (multi-references)" >> ${_scoredir}/result.tc.txt + echo "Case sensitive BLEU result (multi-references)" > ${_scoredir}/result.tc.txt sacrebleu ${case_sensitive_refs} \ -i ${_scoredir}/hyp.trn.detok.lc.rm -m bleu chrf ter \ - >> ${_scoredir}/result.tc.txt + > ${_scoredir}/result.tc.txt log "Write a case-sensitve BLEU (multi-reference) result in ${_scoredir}/result.tc.txt" fi - echo "Case insensitive BLEU result (multi-references)" >> ${_scoredir}/result.lc.txt + echo "Case insensitive BLEU result (multi-references)" > ${_scoredir}/result.lc.txt sacrebleu -lc ${case_insensitive_refs} \ -i ${_scoredir}/hyp.trn.detok.lc.rm -m bleu chrf ter \ - >> ${_scoredir}/result.lc.txt + > ${_scoredir}/result.lc.txt log "Write a case-insensitve BLEU (multi-reference) result in ${_scoredir}/result.lc.txt" fi done @@ -1646,7 +1630,7 @@ EOF # shellcheck disable=SC2086 espnet_model_zoo_upload \ --file "${packed_model}" \ - --title "ESPnet2 pretrained model, ${_model_name}, fs=${fs}, lang=${lang}" \ + --title "ESPnet2 pretrained model, ${_model_name}, fs=${fs}, lang=${src_lang}_${tgt_lang}" \ --description_file "${st_exp}"/description \ --creator_name "${_creator_name}" \ --license "CC-BY-4.0" \ diff --git a/egs2/covost2/st1/conf/fbank.conf b/egs2/covost2/st1/conf/fbank.conf index d75ddde4df8..75232358639 100644 --- a/egs2/covost2/st1/conf/fbank.conf +++ b/egs2/covost2/st1/conf/fbank.conf @@ -1,2 +1,2 @@ ---sample-frequency=8000 +--sample-frequency=16000 --num-mel-bins=80 diff --git a/egs2/covost2/st1/conf/pitch.conf b/egs2/covost2/st1/conf/pitch.conf index 926bcfca92a..e959a19d5b8 100644 --- a/egs2/covost2/st1/conf/pitch.conf +++ b/egs2/covost2/st1/conf/pitch.conf @@ -1 +1 @@ ---sample-frequency=8000 +--sample-frequency=16000 diff --git a/egs2/covost2/st1/run.sh b/egs2/covost2/st1/run.sh index 5cd66dbaf53..778dc824185 100755 --- a/egs2/covost2/st1/run.sh +++ b/egs2/covost2/st1/run.sh @@ -38,7 +38,7 @@ tgt_case=lc.rm train_set=train.${src_lang}-${tgt_lang} train_dev=dev.${src_lang}-${tgt_lang} -test_set="test.${src_lang}-${tgt_lang} dev.${src_lang}-${tgt_lang} " +test_sets="test.${src_lang}-${tgt_lang} dev.${src_lang}-${tgt_lang}" st_config=conf/train_st.yaml inference_config=conf/decode_st.yaml @@ -75,9 +75,9 @@ if [[ ${is_exist} == false ]]; then fi if [ ${is_low_resource} = true ]; then - speed_perturb_factors="0.9 1.0 1.1" -else speed_perturb_factors="0.8 0.9 1.0 1.1 1.2" +else + speed_perturb_factors="0.9 1.0 1.1" fi if [ ${src_lang} == ja ] || [ ${src_lang} == zh-CN ]; then @@ -108,7 +108,7 @@ fi --inference_config "${inference_config}" \ --train_set "${train_set}" \ --valid_set "${train_dev}" \ - --test_sets "${test_set}" \ + --test_sets "${test_sets}" \ --src_bpe_train_text "data/${train_set}/text.${src_case}.${src_lang}" \ --tgt_bpe_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" \ --lm_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" "$@" From eb6dc2d55faac7e62742d0b7791d8f3a991e91d1 Mon Sep 17 00:00:00 2001 From: Chaitanya Narisetty Date: Fri, 6 May 2022 10:08:19 -0400 Subject: [PATCH 2/5] typo fix --- egs2/TEMPLATE/st1/st.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh index e60eacf2cea..a43e6aef9a9 100755 --- a/egs2/TEMPLATE/st1/st.sh +++ b/egs2/TEMPLATE/st1/st.sh @@ -381,7 +381,6 @@ if [ -z "${st_tag}" ]; then st_tag="train_${feats_type}" fi st_tag+="_${src_lang}_${tgt_lang}_${tgt_token_type}_${tgt_case}" - fi if [ "${tgt_token_type}" = bpe ]; then st_tag+="${tgt_nbpe}" fi @@ -400,7 +399,6 @@ if [ -z "${lm_tag}" ]; then lm_tag="train" fi lm_tag+="_${tgt_lang}_${lm_token_type}" - fi if [ "${lm_token_type}" = bpe ]; then lm_tag+="${tgt_nbpe}" fi @@ -413,7 +411,6 @@ fi # The directory used for collect-stats mode if [ -z "${st_stats_dir}" ]; then st_stats_dir="${expdir}/st_stats_${feats_type}_${src_lang}_${tgt_lang}_${tgt_token_type}" - fi if [ "${tgt_token_type}" = bpe ]; then st_stats_dir+="${tgt_nbpe}" fi @@ -423,7 +420,6 @@ if [ -z "${st_stats_dir}" ]; then fi if [ -z "${lm_stats_dir}" ]; then lm_stats_dir="${expdir}/lm_stats_${tgt_lang}_${lm_token_type}" - fi if [ "${lm_token_type}" = bpe ]; then lm_stats_dir+="${tgt_nbpe}" fi From ea44663e8a24ebfcaa03f3bba149e561e970fdf3 Mon Sep 17 00:00:00 2001 From: Chaitanya Narisetty Date: Fri, 13 May 2022 04:43:18 -0400 Subject: [PATCH 3/5] review suggested changes --- .../asr1/pyscripts/utils/rotate_logfile.py | 59 +++++++++++++++++++ egs2/TEMPLATE/st1/st.sh | 18 ++++-- 2 files changed, 71 insertions(+), 6 deletions(-) create mode 100755 egs2/TEMPLATE/asr1/pyscripts/utils/rotate_logfile.py diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/rotate_logfile.py b/egs2/TEMPLATE/asr1/pyscripts/utils/rotate_logfile.py new file mode 100755 index 00000000000..e30c7a1e682 --- /dev/null +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/rotate_logfile.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python + +# Copyright 2022 Chaitanya Narisetty +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Rotate log-file.""" + +import argparse +from pathlib import Path +import shutil + + +def rotate(path, max_num_log_files=1000): + """Rotate a log-file while retaining past `max_num_log_files` files. + Examples: + /some/path/ + ├──logfile.txt + ├──logfile.1.txt + ├──logfile.2.txt + >>> rotate('/some/path/logfile.txt') + /some/path/ + ├──logfile.1.txt + ├──logfile.2.txt + ├──logfile.3.txt + """ + for i in range(max_num_log_files - 1, -1, -1): + if i == 0: + p = Path(path) + pn = p.parent / (p.stem + ".1" + p.suffix) + else: + _p = Path(path) + p = _p.parent / (_p.stem + f".{i}" + _p.suffix) + pn = _p.parent / (_p.stem + f".{i + 1}" + _p.suffix) + + if p.exists(): + if i == max_num_log_files - 1: + p.unlink() + else: + shutil.move(p, pn) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "log_filepath", type=str, help="Path to log-file to be rotated." + ) + parser.add_argument( + "--max-num-log-files", + type=int, + help="Maximum number of log-files to be kept.", + default=1000, + ) + args = parser.parse_args() + + rotate(args.log_filepath, args.max_num_log_files) + + +if __name__ == "__main__": + main() diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh index a43e6aef9a9..bc0f57e4173 100755 --- a/egs2/TEMPLATE/st1/st.sh +++ b/egs2/TEMPLATE/st1/st.sh @@ -1472,12 +1472,18 @@ if ! "${skip_eval}"; then detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok" detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok" + # rotate result files + if [ ${tgt_case} = "tc" ]; then + pyscripts/utils/rotate_logfile.py ${_scoredir}/result.tc.txt + fi + pyscripts/utils/rotate_logfile.py ${_scoredir}/result.lc.txt + if [ ${tgt_case} = "tc" ]; then echo "Case sensitive BLEU result (single-reference)" > ${_scoredir}/result.tc.txt sacrebleu "${_scoredir}/ref.trn.detok" \ -i "${_scoredir}/hyp.trn.detok" \ -m bleu chrf ter \ - > ${_scoredir}/result.tc.txt + >> ${_scoredir}/result.tc.txt log "Write a case-sensitive BLEU (single-reference) result in ${_scoredir}/result.tc.txt" fi @@ -1489,7 +1495,7 @@ if ! "${skip_eval}"; then sacrebleu -lc "${_scoredir}/ref.trn.detok.lc.rm" \ -i "${_scoredir}/hyp.trn.detok.lc.rm" \ -m bleu chrf ter \ - > ${_scoredir}/result.lc.txt + >> ${_scoredir}/result.lc.txt log "Write a case-insensitve BLEU (single-reference) result in ${_scoredir}/result.lc.txt" # process multi-references cases @@ -1520,17 +1526,17 @@ if ! "${skip_eval}"; then done if [ ${tgt_case} = "tc" ]; then - echo "Case sensitive BLEU result (multi-references)" > ${_scoredir}/result.tc.txt + echo "Case sensitive BLEU result (multi-references)" >> ${_scoredir}/result.tc.txt sacrebleu ${case_sensitive_refs} \ -i ${_scoredir}/hyp.trn.detok.lc.rm -m bleu chrf ter \ - > ${_scoredir}/result.tc.txt + >> ${_scoredir}/result.tc.txt log "Write a case-sensitve BLEU (multi-reference) result in ${_scoredir}/result.tc.txt" fi - echo "Case insensitive BLEU result (multi-references)" > ${_scoredir}/result.lc.txt + echo "Case insensitive BLEU result (multi-references)" >> ${_scoredir}/result.lc.txt sacrebleu -lc ${case_insensitive_refs} \ -i ${_scoredir}/hyp.trn.detok.lc.rm -m bleu chrf ter \ - > ${_scoredir}/result.lc.txt + >> ${_scoredir}/result.lc.txt log "Write a case-insensitve BLEU (multi-reference) result in ${_scoredir}/result.lc.txt" fi done From 3cac7bb7f732a694f4b87007271d394a9ee3838e Mon Sep 17 00:00:00 2001 From: Chaitanya Narisetty Date: Fri, 13 May 2022 05:07:55 -0400 Subject: [PATCH 4/5] resolve conflicts and fix lm_train filenames --- egs2/TEMPLATE/st1/st.sh | 65 +++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh index bc0f57e4173..03be329cecf 100755 --- a/egs2/TEMPLATE/st1/st.sh +++ b/egs2/TEMPLATE/st1/st.sh @@ -483,7 +483,7 @@ if ! "${skip_data_prep}"; then done utils/combine_data.sh --extra_files "${utt_extra_files}" "data/${train_set}_sp" ${_dirs} for extra_file in ${utt_extra_files}; do - python pyscripts/utils/remove_duplicate_keys.py data/"${train_set}_sp"/${extra_file} > data/"${train_set}_sp"/${extra_file}.tmp + python pyscripts/utils/remove_duplicate_keys.py data/"${train_set}_sp"/${extra_file} > data/"${train_set}_sp"/${extra_file}.tmp mv data/"${train_set}_sp"/${extra_file}.tmp data/"${train_set}_sp"/${extra_file} done else @@ -522,7 +522,7 @@ if ! "${skip_data_prep}"; then for single_file in $(ls data/"${dset}"/${extra_file}*); do cp ${single_file} "${data_feats}${_suf}/${dset}" expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})" - done + done done echo "${expand_utt_extra_files}" utils/fix_data_dir.sh --utt_extra_files "${expand_utt_extra_files}" "${data_feats}${_suf}/${dset}" @@ -567,7 +567,7 @@ if ! "${skip_data_prep}"; then for single_file in $(ls data/"${dset}"/${extra_file}*); do cp ${single_file} "${data_feats}${_suf}/${dset}" expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})" - done + done done for extra_file in ${expand_utt_extra_files}; do LC_ALL=C sort -u -k1,1 "${data_feats}${_suf}/${dset}/${extra_file}" -o "${data_feats}${_suf}/${dset}/${extra_file}" @@ -616,7 +616,7 @@ if ! "${skip_data_prep}"; then for single_file in $(ls data/"${dset}"/${extra_file}*); do cp ${single_file} "${data_feats}${_suf}/${dset}" expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})" - done + done done utils/fix_data_dir.sh --utt_extra_files "${expand_utt_extra_files}*" "${data_feats}${_suf}/${dset}" for extra_file in ${expand_utt_extra_files}; do @@ -706,11 +706,12 @@ if ! "${skip_data_prep}"; then python pyscripts/utils/remove_duplicate_keys.py ${data_feats}/${dset}/${utt_extra_file} \ > ${data_feats}/${dset}/${utt_extra_file}.tmp mv ${data_feats}/${dset}/${utt_extra_file}.tmp ${data_feats}/${dset}/${utt_extra_file} - done + done done # shellcheck disable=SC2002 - cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" + cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' \ + > "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt" fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then @@ -783,10 +784,10 @@ if ! "${skip_data_prep}"; then # Create word-list for word-LM training if ${use_word_lm} && [ "${tgt_token_type}" != word ]; then - log "Generate word level token_list from ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" + log "Generate word level token_list from ${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt" ${python} -m espnet2.bin.tokenize_text \ --token_type word \ - --input "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" --output "${lm_token_list}" \ + --input "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt" --output "${lm_token_list}" \ --field 2- \ --cleaner "${cleaner}" \ --g2p "${g2p}" \ @@ -872,7 +873,7 @@ fi if ! "${skip_train}"; then if "${use_lm}"; then if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}" + log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}" _opts= if [ -n "${lm_config}" ]; then @@ -885,9 +886,9 @@ if ! "${skip_train}"; then _logdir="${lm_stats_dir}/logdir" mkdir -p "${_logdir}" # Get the minimum number among ${nj} and the number lines of input files - _nj=$(min "${nj}" "$(<${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt wc -l)" "$(<${lm_dev_text} wc -l)") + _nj=$(min "${nj}" "$(<${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt wc -l)" "$(<${lm_dev_text} wc -l)") - key_file="${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" + key_file="${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt" split_scps="" for n in $(seq ${_nj}); do split_scps+=" ${_logdir}/train.${n}.scp" @@ -911,7 +912,7 @@ if ! "${skip_train}"; then log "LM collect-stats started... log: '${_logdir}/stats.*.log'" # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted, # but it's used only for deciding the sample ids. - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \ ${python} -m espnet2.bin.lm_train \ --collect_stats true \ @@ -922,12 +923,12 @@ if ! "${skip_train}"; then --non_linguistic_symbols "${nlsyms_txt}" \ --cleaner "${cleaner}" \ --g2p "${g2p}" \ - --train_data_path_and_name_and_type "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text" \ + --train_data_path_and_name_and_type "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt,text,text" \ --valid_data_path_and_name_and_type "${lm_dev_text},text,text" \ --train_shape_file "${_logdir}/train.JOB.scp" \ --valid_shape_file "${_logdir}/dev.JOB.scp" \ --output_dir "${_logdir}/stats.JOB" \ - ${_opts} ${lm_args} || { cat "${_logdir}"/stats.1.log; exit 1; } + ${_opts} ${lm_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; } # 4. Aggregate shape files _opts= @@ -949,7 +950,7 @@ if ! "${skip_train}"; then if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - log "Stage 7: LM Training: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}" + log "Stage 7: LM Training: train_set=${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}" _opts= if [ -n "${lm_config}" ]; then @@ -967,7 +968,7 @@ if ! "${skip_train}"; then if [ ! -f "${_split_dir}/.done" ]; then rm -f "${_split_dir}/.done" ${python} -m espnet2.bin.split_scps \ - --scps "${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \ + --scps "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \ --num_splits "${num_splits_lm}" \ --output_dir "${_split_dir}" touch "${_split_dir}/.done" @@ -975,12 +976,12 @@ if ! "${skip_train}"; then log "${_split_dir}/.done exists. Spliting is skipped" fi - _opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text " + _opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt,text,text " _opts+="--train_shape_file ${_split_dir}/text_shape.${lm_token_type} " _opts+="--multiple_iterator true " else - _opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt,text,text " + _opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt,text,text " _opts+="--train_shape_file ${lm_stats_dir}/train/text_shape.${lm_token_type} " fi @@ -1053,9 +1054,9 @@ if ! "${skip_train}"; then fi if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then if "${use_ngram}"; then - log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt" - cut -f 2 -d " " ${data_feats}/lm_train.${tgt_case}.${tgt_lang}.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa - build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin + log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt" + cut -f 2 -d " " ${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa + build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin else log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}" fi @@ -1407,7 +1408,7 @@ if ! "${skip_eval}"; then # 2. Submit decoding jobs log "Decoding started... log: '${_logdir}/st_inference.*.log'" - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/st_inference.JOB.log \ ${python} -m ${st_inference_tool} \ --batch_size ${batch_size} \ @@ -1417,7 +1418,7 @@ if ! "${skip_eval}"; then --st_train_config "${st_exp}"/config.yaml \ --st_model_file "${st_exp}"/"${inference_st_model}" \ --output_dir "${_logdir}"/output.JOB \ - ${_opts} ${inference_args} + ${_opts} ${inference_args} || { cat $(grep -l -i error "${_logdir}"/st_inference.*.log) ; exit 1; } # 3. Concatenates the output files from each jobs for f in token token_int score text; do @@ -1463,7 +1464,7 @@ if ! "${skip_eval}"; then ) \ <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \ >"${_scoredir}/hyp.trn.org" - + # remove utterance id perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn" perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn" @@ -1484,7 +1485,7 @@ if ! "${skip_eval}"; then -i "${_scoredir}/hyp.trn.detok" \ -m bleu chrf ter \ >> ${_scoredir}/result.tc.txt - + log "Write a case-sensitive BLEU (single-reference) result in ${_scoredir}/result.tc.txt" fi @@ -1516,8 +1517,8 @@ if ! "${skip_eval}"; then ) \ <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \ >"${_scoredir}/ref.trn.org.${ref_idx}" - - # + + # remove utterance id perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}" detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}" remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}" @@ -1653,11 +1654,11 @@ if ! "${skip_upload_hf}"; then gitlfs=$(git lfs --version 2> /dev/null || true) [ -z "${gitlfs}" ] && \ log "ERROR: You need to install git-lfs first" && \ - exit 1 - + exit 1 + dir_repo=${expdir}/hf_${hf_repo//"/"/"_"} [ ! -d "${dir_repo}" ] && git clone https://huggingface.co/${hf_repo} ${dir_repo} - + if command -v git &> /dev/null; then _creator_name="$(git config user.name)" _checkout="git checkout $(git show -s --format=%H)" @@ -1670,13 +1671,13 @@ if ! "${skip_upload_hf}"; then # foo/asr1 -> foo _corpus="${_task%/*}" _model_name="${_creator_name}/${_corpus}_$(basename ${packed_model} .zip)" - + # copy files in ${dir_repo} unzip -o ${packed_model} -d ${dir_repo} # Generate description file # shellcheck disable=SC2034 hf_task=speech-translation - # shellcheck disable=SC2034 + # shellcheck disable=SC2034 espnet_task=ST # shellcheck disable=SC2034 task_exp=${st_exp} From aa5d6ffff67079f2cbe6a7e1eba852e459f0f6a4 Mon Sep 17 00:00:00 2001 From: Chaitanya Narisetty Date: Fri, 13 May 2022 05:15:32 -0400 Subject: [PATCH 5/5] fix lm tag names --- egs2/TEMPLATE/st1/st.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh index 03be329cecf..b37cd3c5f22 100755 --- a/egs2/TEMPLATE/st1/st.sh +++ b/egs2/TEMPLATE/st1/st.sh @@ -398,7 +398,7 @@ if [ -z "${lm_tag}" ]; then else lm_tag="train" fi - lm_tag+="_${tgt_lang}_${lm_token_type}" + lm_tag+="_${src_lang}_${tgt_lang}_${lm_token_type}" if [ "${lm_token_type}" = bpe ]; then lm_tag+="${tgt_nbpe}" fi @@ -419,7 +419,7 @@ if [ -z "${st_stats_dir}" ]; then fi fi if [ -z "${lm_stats_dir}" ]; then - lm_stats_dir="${expdir}/lm_stats_${tgt_lang}_${lm_token_type}" + lm_stats_dir="${expdir}/lm_stats_${src_lang}_${tgt_lang}_${lm_token_type}" if [ "${lm_token_type}" = bpe ]; then lm_stats_dir+="${tgt_nbpe}" fi @@ -485,7 +485,7 @@ if ! "${skip_data_prep}"; then for extra_file in ${utt_extra_files}; do python pyscripts/utils/remove_duplicate_keys.py data/"${train_set}_sp"/${extra_file} > data/"${train_set}_sp"/${extra_file}.tmp mv data/"${train_set}_sp"/${extra_file}.tmp data/"${train_set}_sp"/${extra_file} - done + done else log "Skip stage 2: Speed perturbation" fi @@ -704,7 +704,7 @@ if ! "${skip_data_prep}"; then utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" "${data_feats}/${dset}" for utt_extra_file in ${utt_extra_files}; do python pyscripts/utils/remove_duplicate_keys.py ${data_feats}/${dset}/${utt_extra_file} \ - > ${data_feats}/${dset}/${utt_extra_file}.tmp + > ${data_feats}/${dset}/${utt_extra_file}.tmp mv ${data_feats}/${dset}/${utt_extra_file}.tmp ${data_feats}/${dset}/${utt_extra_file} done done