Merge pull request espnet#4341 from chintu619/st_bugfix
bug fixes in ST recipes
ftshijt authored May 18, 2022
2 parents c173c30 + aa5d6ff commit 3c96908
Showing 5 changed files with 104 additions and 58 deletions.
59 changes: 59 additions & 0 deletions egs2/TEMPLATE/asr1/pyscripts/utils/rotate_logfile.py
@@ -0,0 +1,59 @@
#!/usr/bin/env python

# Copyright 2022 Chaitanya Narisetty
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

"""Rotate log-file."""

import argparse
from pathlib import Path
import shutil


def rotate(path, max_num_log_files=1000):
"""Rotate a log-file while retaining past `max_num_log_files` files.
Examples:
/some/path/
├──logfile.txt
├──logfile.1.txt
├──logfile.2.txt
>>> rotate('/some/path/logfile.txt')
/some/path/
├──logfile.1.txt
├──logfile.2.txt
├──logfile.3.txt
"""
for i in range(max_num_log_files - 1, -1, -1):
if i == 0:
p = Path(path)
pn = p.parent / (p.stem + ".1" + p.suffix)
else:
_p = Path(path)
p = _p.parent / (_p.stem + f".{i}" + _p.suffix)
pn = _p.parent / (_p.stem + f".{i + 1}" + _p.suffix)

if p.exists():
if i == max_num_log_files - 1:
p.unlink()
else:
shutil.move(p, pn)


def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"log_filepath", type=str, help="Path to log-file to be rotated."
)
parser.add_argument(
"--max-num-log-files",
type=int,
help="Maximum number of log-files to be kept.",
default=1000,
)
args = parser.parse_args()

rotate(args.log_filepath, args.max_num_log_files)


if __name__ == "__main__":
main()
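The helper shifts every existing log up by one index (deleting the oldest once max_num_log_files is reached), so a fresh file can be written without clobbering past results; st.sh calls it before writing new scoring output. A minimal usage sketch — the decode-directory path here is hypothetical:

    # before: result.tc.txt and result.tc.1.txt exist in the scoring directory
    pyscripts/utils/rotate_logfile.py exp/st_train/decode_test/score_bleu/result.tc.txt
    # after:  result.tc.txt -> result.tc.1.txt, result.tc.1.txt -> result.tc.2.txt
    # the history length can be capped, e.g. --max-num-log-files 5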
91 changes: 39 additions & 52 deletions egs2/TEMPLATE/st1/st.sh
@@ -139,7 +139,6 @@ lm_test_text= # Text file path of language model evaluation set.
nlsyms_txt=none # Non-linguistic symbol list if existing.
cleaner=none # Text cleaner.
g2p=none # g2p method (needed if token_type=phn).
lang=noinfo # The language type of corpus.
score_opts= # The options given to sclite scoring
local_score_opts= # The options given to local/score.sh.
st_speech_fold_length=800 # fold_length for speech data during ST training.
@@ -250,7 +249,6 @@ Options:
--nlsyms_txt # Non-linguistic symbol list if existing (default="${nlsyms_txt}").
--cleaner # Text cleaner (default="${cleaner}").
--g2p # g2p method (default="${g2p}").
--lang # The language type of corpus (default=${lang}).
--score_opts # The options given to sclite scoring (default="{score_opts}").
--local_score_opts # The options given to local/score.sh (default="{local_score_opts}").
--st_speech_fold_length # fold_length for speech data during ST training (default="${st_speech_fold_length}").
@@ -307,11 +305,7 @@ utt_extra_files="text.${src_case}.${src_lang} text.${tgt_case}.${tgt_lang}"
[ -z "${lm_test_text}" ] && lm_test_text="${data_feats}/${test_sets%% *}/text.${tgt_case}.${tgt_lang}"

# Check tokenization type
if [ "${lang}" != noinfo ]; then
token_listdir=data/${lang}_token_list
else
token_listdir=data/token_list
fi
token_listdir=data/${src_lang}_${tgt_lang}_token_list
# The tgt bpedir is set for all cases when using bpe
tgt_bpedir="${token_listdir}/tgt_bpe_${tgt_bpemode}${tgt_nbpe}"
tgt_bpeprefix="${tgt_bpedir}"/bpe
@@ -386,11 +380,7 @@ if [ -z "${st_tag}" ]; then
else
st_tag="train_${feats_type}"
fi
if [ "${lang}" != noinfo ]; then
st_tag+="_${lang}_${tgt_token_type}_${tgt_case}"
else
st_tag+="_${tgt_token_type}_${tgt_case}"
fi
st_tag+="_${src_lang}_${tgt_lang}_${tgt_token_type}_${tgt_case}"
if [ "${tgt_token_type}" = bpe ]; then
st_tag+="${tgt_nbpe}"
fi
@@ -408,11 +398,7 @@ if [ -z "${lm_tag}" ]; then
else
lm_tag="train"
fi
if [ "${lang}" != noinfo ]; then
lm_tag+="_${lang}_${lm_token_type}"
else
lm_tag+="_${lm_token_type}"
fi
lm_tag+="_${src_lang}_${tgt_lang}_${lm_token_type}"
if [ "${lm_token_type}" = bpe ]; then
lm_tag+="${tgt_nbpe}"
fi
@@ -424,11 +410,7 @@ fi

# The directory used for collect-stats mode
if [ -z "${st_stats_dir}" ]; then
if [ "${lang}" != noinfo ]; then
st_stats_dir="${expdir}/st_stats_${feats_type}_${lang}_${tgt_token_type}"
else
st_stats_dir="${expdir}/st_stats_${feats_type}_${tgt_token_type}"
fi
st_stats_dir="${expdir}/st_stats_${feats_type}_${src_lang}_${tgt_lang}_${tgt_token_type}"
if [ "${tgt_token_type}" = bpe ]; then
st_stats_dir+="${tgt_nbpe}"
fi
@@ -437,11 +419,7 @@ if [ -z "${st_stats_dir}" ]; then
fi
fi
if [ -z "${lm_stats_dir}" ]; then
if [ "${lang}" != noinfo ]; then
lm_stats_dir="${expdir}/lm_stats_${lang}_${lm_token_type}"
else
lm_stats_dir="${expdir}/lm_stats_${lm_token_type}"
fi
lm_stats_dir="${expdir}/lm_stats_${src_lang}_${tgt_lang}_${lm_token_type}"
if [ "${lm_token_type}" = bpe ]; then
lm_stats_dir+="${tgt_nbpe}"
fi
@@ -540,7 +518,7 @@ if ! "${skip_data_prep}"; then
# expand the utt_extra_files for multi-references
expand_utt_extra_files=""
for extra_file in ${utt_extra_files}; do
# with regex to suuport multi-references
# with regex to support multi-references
for single_file in $(ls data/"${dset}"/${extra_file}*); do
cp ${single_file} "${data_feats}${_suf}/${dset}"
expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
@@ -585,7 +563,7 @@ if ! "${skip_data_prep}"; then
# expand the utt_extra_files for multi-references
expand_utt_extra_files=""
for extra_file in ${utt_extra_files}; do
# with regex to suuport multi-references
# with regex to support multi-references
for single_file in $(ls data/"${dset}"/${extra_file}*); do
cp ${single_file} "${data_feats}${_suf}/${dset}"
expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
@@ -634,7 +612,7 @@ if ! "${skip_data_prep}"; then
# expand the utt_extra_files for multi-references
expand_utt_extra_files=""
for extra_file in ${utt_extra_files}; do
# with regex to suuport multi-references
# with regex to support multi-references
for single_file in $(ls data/"${dset}"/${extra_file}*); do
cp ${single_file} "${data_feats}${_suf}/${dset}"
expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
@@ -717,8 +695,10 @@ if ! "${skip_data_prep}"; then
fi

# Remove empty text
<"${data_feats}/org/${dset}/text" \
awk ' { if( NF != 1 ) print $0; } ' >"${data_feats}/${dset}/text"
for utt_extra_file in ${utt_extra_files}; do
<"${data_feats}/org/${dset}/${utt_extra_file}" \
awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/${dset}/${utt_extra_file}"
done

# fix_data_dir.sh leaves only utts which exist in all files
utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" "${data_feats}/${dset}"
@@ -730,7 +710,8 @@ if ! "${skip_data_prep}"; then
done

# shellcheck disable=SC2002
cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' > "${data_feats}/lm_train.txt"
cat ${lm_train_text} | awk ' { if( NF != 1 ) print $0; } ' \
> "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt"
fi

if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
@@ -803,10 +784,10 @@ if ! "${skip_data_prep}"; then

# Create word-list for word-LM training
if ${use_word_lm} && [ "${tgt_token_type}" != word ]; then
log "Generate word level token_list from ${data_feats}/lm_train.txt"
log "Generate word level token_list from ${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt"
${python} -m espnet2.bin.tokenize_text \
--token_type word \
--input "${data_feats}/lm_train.txt" --output "${lm_token_list}" \
--input "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt" --output "${lm_token_list}" \
--field 2- \
--cleaner "${cleaner}" \
--g2p "${g2p}" \
@@ -892,7 +873,7 @@ fi
if ! "${skip_train}"; then
if "${use_lm}"; then
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.txt, dev_set=${lm_dev_text}"
log "Stage 6: LM collect stats: train_set=${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}"

_opts=
if [ -n "${lm_config}" ]; then
@@ -905,9 +886,9 @@ if ! "${skip_train}"; then
_logdir="${lm_stats_dir}/logdir"
mkdir -p "${_logdir}"
# Get the minimum number among ${nj} and the number lines of input files
_nj=$(min "${nj}" "$(<${data_feats}/lm_train.txt wc -l)" "$(<${lm_dev_text} wc -l)")
_nj=$(min "${nj}" "$(<${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt wc -l)" "$(<${lm_dev_text} wc -l)")

key_file="${data_feats}/lm_train.txt"
key_file="${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt"
split_scps=""
for n in $(seq ${_nj}); do
split_scps+=" ${_logdir}/train.${n}.scp"
@@ -942,7 +923,7 @@ if ! "${skip_train}"; then
--non_linguistic_symbols "${nlsyms_txt}" \
--cleaner "${cleaner}" \
--g2p "${g2p}" \
--train_data_path_and_name_and_type "${data_feats}/lm_train.txt,text,text" \
--train_data_path_and_name_and_type "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt,text,text" \
--valid_data_path_and_name_and_type "${lm_dev_text},text,text" \
--train_shape_file "${_logdir}/train.JOB.scp" \
--valid_shape_file "${_logdir}/dev.JOB.scp" \
@@ -969,7 +950,7 @@ if ! "${skip_train}"; then


if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
log "Stage 7: LM Training: train_set=${data_feats}/lm_train.txt, dev_set=${lm_dev_text}"
log "Stage 7: LM Training: train_set=${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt, dev_set=${lm_dev_text}"

_opts=
if [ -n "${lm_config}" ]; then
@@ -987,20 +968,20 @@ if ! "${skip_train}"; then
if [ ! -f "${_split_dir}/.done" ]; then
rm -f "${_split_dir}/.done"
${python} -m espnet2.bin.split_scps \
--scps "${data_feats}/lm_train.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \
--scps "${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt" "${lm_stats_dir}/train/text_shape.${lm_token_type}" \
--num_splits "${num_splits_lm}" \
--output_dir "${_split_dir}"
touch "${_split_dir}/.done"
else
log "${_split_dir}/.done exists. Spliting is skipped"
fi

_opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.txt,text,text "
_opts+="--train_data_path_and_name_and_type ${_split_dir}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt,text,text "
_opts+="--train_shape_file ${_split_dir}/text_shape.${lm_token_type} "
_opts+="--multiple_iterator true "

else
_opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.txt,text,text "
_opts+="--train_data_path_and_name_and_type ${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt,text,text "
_opts+="--train_shape_file ${lm_stats_dir}/train/text_shape.${lm_token_type} "
fi

@@ -1073,8 +1054,8 @@ if ! "${skip_train}"; then
fi
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
if "${use_ngram}"; then
log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.txt"
cut -f 2 -d " " ${data_feats}/lm_train.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa
log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt"
cut -f 2 -d " " ${data_feats}/lm_train.${src_lang}.${tgt_case}.${tgt_lang}.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa
build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin
else
log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}"
@@ -1485,15 +1466,21 @@ if ! "${skip_eval}"; then
>"${_scoredir}/hyp.trn.org"

# remove utterance id
perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn"
perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn"
perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn"
perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn"

# detokenizer
detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn" > "${_scoredir}/ref.trn.detok"
detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/hyp.trn" > "${_scoredir}/hyp.trn.detok"

# rotate result files
if [ ${tgt_case} = "tc" ]; then
pyscripts/utils/rotate_logfile.py ${_scoredir}/result.tc.txt
fi
pyscripts/utils/rotate_logfile.py ${_scoredir}/result.lc.txt

if [ ${tgt_case} = "tc" ]; then
echo "Case sensitive BLEU result (single-reference)" >> ${_scoredir}/result.tc.txt
echo "Case sensitive BLEU result (single-reference)" > ${_scoredir}/result.tc.txt
sacrebleu "${_scoredir}/ref.trn.detok" \
-i "${_scoredir}/hyp.trn.detok" \
-m bleu chrf ter \
@@ -1505,7 +1492,7 @@ if ! "${skip_eval}"; then
# detokenize & remove punctuation except apostrophe
remove_punctuation.pl < "${_scoredir}/ref.trn.detok" > "${_scoredir}/ref.trn.detok.lc.rm"
remove_punctuation.pl < "${_scoredir}/hyp.trn.detok" > "${_scoredir}/hyp.trn.detok.lc.rm"
echo "Case insensitive BLEU result (single-reference)" >> ${_scoredir}/result.lc.txt
echo "Case insensitive BLEU result (single-reference)" > ${_scoredir}/result.lc.txt
sacrebleu -lc "${_scoredir}/ref.trn.detok.lc.rm" \
-i "${_scoredir}/hyp.trn.detok.lc.rm" \
-m bleu chrf ter \
@@ -1531,8 +1518,8 @@ if ! "${skip_eval}"; then
<(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \
>"${_scoredir}/ref.trn.org.${ref_idx}"

#
perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}"
# remove utterance id
perl -pe 's/\([^\)]+\)$//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}"
detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}"
remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}"
case_sensitive_refs="${case_sensitive_refs} ${_scoredir}/ref.trn.detok.${ref_idx}"
@@ -1646,7 +1633,7 @@ EOF
# shellcheck disable=SC2086
espnet_model_zoo_upload \
--file "${packed_model}" \
--title "ESPnet2 pretrained model, ${_model_name}, fs=${fs}, lang=${lang}" \
--title "ESPnet2 pretrained model, ${_model_name}, fs=${fs}, lang=${src_lang}_${tgt_lang}" \
--description_file "${st_exp}"/description \
--creator_name "${_creator_name}" \
--license "CC-BY-4.0" \
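Taken together, the st.sh changes key every derived path on the source/target language pair instead of the removed --lang option, and the LM training text now carries the pair and case in its name; scoring also rotates result.tc.txt / result.lc.txt via the new helper and writes them fresh (>) instead of appending (>>). A sketch of the resulting names for a hypothetical de-en setup (all values below are illustrative, not taken from this PR):

    # assuming src_lang=de, tgt_lang=en, tgt_case=tc, tgt_token_type=bpe, tgt_nbpe=4000, feats_type=raw
    token_listdir=data/de_en_token_list
    st_stats_dir=exp/st_stats_raw_de_en_bpe4000
    lm_train_text=dump/raw/lm_train.de.tc.en.txt   # previously dump/raw/lm_train.txt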
2 changes: 1 addition & 1 deletion egs2/covost2/st1/conf/fbank.conf
@@ -1,2 +1,2 @@
--sample-frequency=8000
--sample-frequency=16000
--num-mel-bins=80
2 changes: 1 addition & 1 deletion egs2/covost2/st1/conf/pitch.conf
@@ -1 +1 @@
--sample-frequency=8000
--sample-frequency=16000
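CoVoST2 audio is processed at 16 kHz, so the 8 kHz (telephone-band) values in both feature configs were wrong for this corpus; Kaldi's feature binaries refuse wavs whose sample rate does not match --sample-frequency unless downsampling is explicitly allowed. Both files are read during fbank_pitch extraction, roughly as sketched below (a sketch assuming the template's Kaldi-style helper, which picks up conf/fbank.conf and conf/pitch.conf by default; the data directory is illustrative):

    # feature extraction stage consuming the two configs
    steps/make_fbank_pitch.sh --nj 32 --cmd "${train_cmd}" \
        --write_utt2num_frames true data/train.de-en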
8 changes: 4 additions & 4 deletions egs2/covost2/st1/run.sh
@@ -38,7 +38,7 @@ tgt_case=lc.rm

train_set=train.${src_lang}-${tgt_lang}
train_dev=dev.${src_lang}-${tgt_lang}
test_set="test.${src_lang}-${tgt_lang} dev.${src_lang}-${tgt_lang} "
test_sets="test.${src_lang}-${tgt_lang} dev.${src_lang}-${tgt_lang}"

st_config=conf/train_st.yaml
inference_config=conf/decode_st.yaml
@@ -75,9 +75,9 @@ if [[ ${is_exist} == false ]]; then
fi

if [ ${is_low_resource} = true ]; then
speed_perturb_factors="0.9 1.0 1.1"
else
speed_perturb_factors="0.8 0.9 1.0 1.1 1.2"
else
speed_perturb_factors="0.9 1.0 1.1"
fi

if [ ${src_lang} == ja ] || [ ${src_lang} == zh-CN ]; then
@@ -108,7 +108,7 @@ fi
--inference_config "${inference_config}" \
--train_set "${train_set}" \
--valid_set "${train_dev}" \
--test_sets "${test_set}" \
--test_sets "${test_sets}" \
--src_bpe_train_text "data/${train_set}/text.${src_case}.${src_lang}" \
--tgt_bpe_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" \
--lm_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" "$@"
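The run.sh fixes swap the speed-perturbation branches so that low-resource pairs now get the wider factor set (roughly 5x augmentation), and rename the list variable test_set to test_sets (dropping a stray trailing space) to mirror the --test_sets option it populates. For context, st.sh consumes the factors by building one perturbed copy per factor and combining them, roughly as below (a sketch of the template's data-prep logic; helper names assumed):

    # data/${train_set} -> data/${train_set}_sp, one copy per factor
    for factor in ${speed_perturb_factors}; do
        if [ "${factor}" != "1.0" ]; then
            scripts/utils/perturb_data_dir_speed.sh "${factor}" \
                "data/${train_set}" "data/${train_set}_sp${factor}"
            _dirs+="data/${train_set}_sp${factor} "
        else
            _dirs+="data/${train_set} "   # factor 1.0 is the unmodified data
        fi
    done
    utils/combine_data.sh "data/${train_set}_sp" ${_dirs}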
