From 98689a5f0bfd88efffdbbcdd5d924e186d563a91 Mon Sep 17 00:00:00 2001 From: kamo-naoyuki Date: Thu, 12 May 2022 21:17:35 +0900 Subject: [PATCH] change to show the error logs when jobs are failed --- .../asr1/scripts/utils/evaluate_asr.sh | 4 +- egs2/TEMPLATE/diar1/diar.sh | 8 +- egs2/TEMPLATE/enh1/enh.sh | 8 +- egs2/TEMPLATE/enh_asr1/enh_asr.sh | 12 +-- egs2/TEMPLATE/enh_st1/enh_st.sh | 32 +++--- egs2/TEMPLATE/mt1/mt.sh | 32 +++--- egs2/TEMPLATE/ssl1/hubert.sh | 100 +++++++++--------- egs2/TEMPLATE/st1/st.sh | 42 ++++---- egs2/TEMPLATE/tts1/tts.sh | 8 +- 9 files changed, 123 insertions(+), 123 deletions(-) diff --git a/egs2/TEMPLATE/asr1/scripts/utils/evaluate_asr.sh b/egs2/TEMPLATE/asr1/scripts/utils/evaluate_asr.sh index 7d3da2bfbea..0cc2c632591 100755 --- a/egs2/TEMPLATE/asr1/scripts/utils/evaluate_asr.sh +++ b/egs2/TEMPLATE/asr1/scripts/utils/evaluate_asr.sh @@ -173,14 +173,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # 2. Submit decoding jobs log "Decoding started... log: '${logdir}/asr_inference.*.log'" - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${logdir}"/asr_inference.JOB.log \ python3 -m espnet2.bin.asr_inference \ --ngpu "${_ngpu}" \ --data_path_and_name_and_type "${wavscp},speech,sound" \ --key_file "${logdir}"/keys.JOB.scp \ --output_dir "${logdir}"/output.JOB \ - "${_opts[@]}" ${inference_args} + "${_opts[@]}" ${inference_args} || { cat $(grep -l -i error "${logdir}"/asr_inference.*.log) ; exit 1; } # 3. Concatenates the output files from each jobs for f in token token_int score text; do diff --git a/egs2/TEMPLATE/diar1/diar.sh b/egs2/TEMPLATE/diar1/diar.sh index 815c73537f4..b711d324eab 100755 --- a/egs2/TEMPLATE/diar1/diar.sh +++ b/egs2/TEMPLATE/diar1/diar.sh @@ -348,7 +348,7 @@ if ! "${skip_train}"; then # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted, # but it's used only for deciding the sample ids. - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \ ${python} -m espnet2.bin.diar_train \ --collect_stats true \ @@ -360,7 +360,7 @@ if ! "${skip_train}"; then --train_shape_file "${_logdir}/train.JOB.scp" \ --valid_shape_file "${_logdir}/valid.JOB.scp" \ --output_dir "${_logdir}/stats.JOB" \ - ${_opts} ${diar_args} || { cat "${_logdir}"/stats.1.log; exit 1; } + ${_opts} ${diar_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; } # 4. Aggregate shape files _opts= @@ -510,7 +510,7 @@ if ! "${skip_eval}"; then # 2. Submit inference jobs log "Diarization started... log: '${_logdir}/diar_inference.*.log'" - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/diar_inference.JOB.log \ ${python} -m espnet2.bin.diar_inference \ --ngpu "${_ngpu}" \ @@ -520,7 +520,7 @@ if ! "${skip_eval}"; then --train_config "${diar_exp}"/config.yaml \ --model_file "${diar_exp}"/"${inference_model}" \ --output_dir "${_logdir}"/output.JOB \ - ${_opts} + ${_opts} || { cat $(grep -l -i error "${_logdir}"/diar_inference.*.log) ; exit 1; } # 3. Concatenates the output files from each jobs for i in $(seq "${_nj}"); do diff --git a/egs2/TEMPLATE/enh1/enh.sh b/egs2/TEMPLATE/enh1/enh.sh index db170043db6..864a0485df0 100755 --- a/egs2/TEMPLATE/enh1/enh.sh +++ b/egs2/TEMPLATE/enh1/enh.sh @@ -494,7 +494,7 @@ if ! "${skip_train}"; then # but it's used only for deciding the sample ids. 
- # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \ ${python} -m espnet2.bin.enh_train \ --collect_stats true \ @@ -504,7 +504,7 @@ if ! "${skip_train}"; then --train_shape_file "${_logdir}/train.JOB.scp" \ --valid_shape_file "${_logdir}/valid.JOB.scp" \ --output_dir "${_logdir}/stats.JOB" \ - ${_opts} ${enh_args} || { cat "${_logdir}"/stats.1.log; exit 1; } + ${_opts} ${enh_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; } # 4. Aggregate shape files _opts= @@ -652,7 +652,7 @@ if ! "${skip_eval}"; then # 2. Submit inference jobs log "Enhancement started... log: '${_logdir}/enh_inference.*.log'" - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/enh_inference.JOB.log \ ${python} -m espnet2.bin.enh_inference \ --ngpu "${_ngpu}" \ @@ -663,7 +663,7 @@ if ! "${skip_eval}"; then ${inference_enh_config:+--inference_config "$inference_enh_config"} \ --model_file "${enh_exp}"/"${inference_model}" \ --output_dir "${_logdir}"/output.JOB \ - ${_opts} ${inference_args} + ${_opts} ${inference_args} || { cat $(grep -l -i error "${_logdir}"/enh_inference.*.log) ; exit 1; } _spk_list=" " diff --git a/egs2/TEMPLATE/enh_asr1/enh_asr.sh b/egs2/TEMPLATE/enh_asr1/enh_asr.sh index fc720ddf94b..9ec09219613 100755 --- a/egs2/TEMPLATE/enh_asr1/enh_asr.sh +++ b/egs2/TEMPLATE/enh_asr1/enh_asr.sh @@ -794,7 +794,7 @@ if ! "${skip_train}"; then log "LM collect-stats started... log: '${_logdir}/stats.*.log'" # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted, # but it's used only for deciding the sample ids. - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \ ${python} -m espnet2.bin.lm_train \ --collect_stats true \ @@ -810,7 +810,7 @@ if ! "${skip_train}"; then --train_shape_file "${_logdir}/train.JOB.scp" \ --valid_shape_file "${_logdir}/dev.JOB.scp" \ --output_dir "${_logdir}/stats.JOB" \ - ${_opts} ${lm_args} || { cat "${_logdir}"/stats.1.log; exit 1; } + ${_opts} ${lm_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; } # 4. Aggregate shape files _opts= @@ -937,7 +937,7 @@ if ! "${skip_train}"; then if "${use_ngram}"; then log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.txt" cut -f 2 -d " " ${data_feats}/lm_train.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa - build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin + build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin else log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}" fi @@ -1335,7 +1335,7 @@ if ! "${skip_eval}"; then # 2. Submit inference jobs log "Enhancement started... log: '${_logdir}/enh_inference.*.log'" - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/enh_inference.JOB.log \ ${python} -m espnet2.bin.enh_inference \ --enh_s2t_task true \ @@ -1347,7 +1347,7 @@ if ! "${skip_eval}"; then ${inference_enh_config:+--inference_config "$inference_enh_config"} \ --model_file "${enh_asr_exp}"/"${inference_enh_asr_model}" \ --output_dir "${_logdir}"/output.JOB \ - ${_opts} ${enh_inference_args} + ${_opts} ${enh_inference_args} || { cat $(grep -l -i error "${_logdir}"/enh_inference.*.log) ; exit 1; } # 3. 
Concatenates the output files from each jobs _spk_list=" " @@ -1632,7 +1632,7 @@ if ! "${skip_upload_hf}"; then # Generate description file # shellcheck disable=SC2034 hf_task=speech-enhancement-recognition - # shellcheck disable=SC2034 + # shellcheck disable=SC2034 espnet_task=EnhS2T # shellcheck disable=SC2034 task_exp=${enh_asr_exp} diff --git a/egs2/TEMPLATE/enh_st1/enh_st.sh b/egs2/TEMPLATE/enh_st1/enh_st.sh index eabf49cc29d..b27f986e582 100755 --- a/egs2/TEMPLATE/enh_st1/enh_st.sh +++ b/egs2/TEMPLATE/enh_st1/enh_st.sh @@ -551,7 +551,7 @@ if ! "${skip_data_prep}"; then done utils/combine_data.sh --extra_files "${utt_extra_files} ${_scp_list}" "data/${train_set}_sp" ${_dirs} for extra_file in ${utt_extra_files}; do - python pyscripts/utils/remove_duplicate_keys.py data/"${train_set}_sp"/${extra_file} > data/"${train_set}_sp"/${extra_file}.tmp + python pyscripts/utils/remove_duplicate_keys.py data/"${train_set}_sp"/${extra_file} > data/"${train_set}_sp"/${extra_file}.tmp mv data/"${train_set}_sp"/${extra_file}.tmp data/"${train_set}_sp"/${extra_file} done else @@ -593,7 +593,7 @@ if ! "${skip_data_prep}"; then fi cp ${single_file} "${data_feats}${_suf}/${dset}" expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})" - done + done done echo "${expand_utt_extra_files}" utils/fix_data_dir.sh --utt_extra_files "${expand_utt_extra_files}" "${data_feats}${_suf}/${dset}" @@ -727,9 +727,9 @@ if ! "${skip_data_prep}"; then utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" "${data_feats}/${dset}" for utt_extra_file in ${utt_extra_files}; do python pyscripts/utils/remove_duplicate_keys.py ${data_feats}/${dset}/${utt_extra_file} \ - > ${data_feats}/${dset}/${utt_extra_file}.tmp + > ${data_feats}/${dset}/${utt_extra_file}.tmp mv ${data_feats}/${dset}/${utt_extra_file}.tmp ${data_feats}/${dset}/${utt_extra_file} - done + done done # shellcheck disable=SC2002 @@ -934,7 +934,7 @@ if ! "${skip_train}"; then log "LM collect-stats started... log: '${_logdir}/stats.*.log'" # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted, # but it's used only for deciding the sample ids. - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \ ${python} -m espnet2.bin.lm_train \ --collect_stats true \ @@ -950,7 +950,7 @@ if ! "${skip_train}"; then --train_shape_file "${_logdir}/train.JOB.scp" \ --valid_shape_file "${_logdir}/dev.JOB.scp" \ --output_dir "${_logdir}/stats.JOB" \ - ${_opts} ${lm_args} || { cat "${_logdir}"/stats.1.log; exit 1; } + ${_opts} ${lm_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; } # 4. Aggregate shape files _opts= @@ -1078,7 +1078,7 @@ if ! "${skip_train}"; then if "${use_ngram}"; then log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.txt" cut -f 2 -d " " ${data_feats}/lm_train.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa - build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin + build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin else log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}" fi @@ -1148,7 +1148,7 @@ if ! "${skip_train}"; then # but it's used only for deciding the sample ids. 
# TODO(jiatong): fix different bpe model - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \ ${python} -m espnet2.bin.enh_s2t_train \ --collect_stats true \ @@ -1173,7 +1173,7 @@ if ! "${skip_train}"; then --train_shape_file "${_logdir}/train.JOB.scp" \ --valid_shape_file "${_logdir}/valid.JOB.scp" \ --output_dir "${_logdir}/stats.JOB" \ - ${_opts} ${enh_st_args} || { cat "${_logdir}"/stats.1.log; exit 1; } + ${_opts} ${enh_st_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; } # 4. Aggregate shape files _opts= @@ -1436,7 +1436,7 @@ if ! "${skip_eval}"; then # 2. Submit decoding jobs log "Decoding started... log: '${_logdir}/st_inference.*.log'" - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/st_inference.JOB.log \ ${python} -m ${st_inference_tool} \ --enh_s2t_task true \ @@ -1447,7 +1447,7 @@ if ! "${skip_eval}"; then --st_train_config "${enh_st_exp}"/config.yaml \ --st_model_file "${enh_st_exp}"/"${inference_enh_st_model}" \ --output_dir "${_logdir}"/output.JOB \ - ${_opts} ${st_inference_args} + ${_opts} ${st_inference_args} || { cat $(grep -l -i error "${_logdir}"/st_inference.*.log) ; exit 1; } # 3. Concatenates the output files from each jobs for f in token token_int score text; do @@ -1773,11 +1773,11 @@ if ! "${skip_upload_hf}"; then gitlfs=$(git lfs --version 2> /dev/null || true) [ -z "${gitlfs}" ] && \ log "ERROR: You need to install git-lfs first" && \ - exit 1 - + exit 1 + dir_repo=${expdir}/hf_${hf_repo//"/"/"_"} [ ! -d "${dir_repo}" ] && git clone https://huggingface.co/${hf_repo} ${dir_repo} - + if command -v git &> /dev/null; then _creator_name="$(git config user.name)" _checkout="git checkout $(git show -s --format=%H)" @@ -1790,13 +1790,13 @@ if ! "${skip_upload_hf}"; then # foo/asr1 -> foo _corpus="${_task%/*}" _model_name="${_creator_name}/${_corpus}_$(basename ${packed_model} .zip)" - + # copy files in ${dir_repo} unzip -o ${packed_model} -d ${dir_repo} # Generate description file # shellcheck disable=SC2034 hf_task=speech-enhancement-translation - # shellcheck disable=SC2034 + # shellcheck disable=SC2034 espnet_task=EnhS2T # shellcheck disable=SC2034 task_exp=${enh_st_exp} diff --git a/egs2/TEMPLATE/mt1/mt.sh b/egs2/TEMPLATE/mt1/mt.sh index bf6996c13c8..02260cb3a4d 100755 --- a/egs2/TEMPLATE/mt1/mt.sh +++ b/egs2/TEMPLATE/mt1/mt.sh @@ -455,7 +455,7 @@ if ! "${skip_data_prep}"; then log "Stage 1: Data preparation for data/${train_set}, data/${valid_set}, etc." # [Task dependent] Need to create data.sh for new corpus local/data.sh ${local_data_opts} - + fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -474,7 +474,7 @@ if ! "${skip_data_prep}"; then # with regex to suuport multi-references for single_file in $(ls data/"${dset}"/${extra_file}*); do cp ${single_file} "${data_feats}${_suf}/${dset}" - done + done done echo "${feats_type}" > "${data_feats}${_suf}/${dset}/feats_type" done @@ -702,7 +702,7 @@ if ! "${skip_train}"; then log "LM collect-stats started... log: '${_logdir}/stats.*.log'" # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted, # but it's used only for deciding the sample ids. - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \ ${python} -m espnet2.bin.lm_train \ --collect_stats true \ @@ -718,7 +718,7 @@ if ! 
"${skip_train}"; then --train_shape_file "${_logdir}/train.JOB.scp" \ --valid_shape_file "${_logdir}/dev.JOB.scp" \ --output_dir "${_logdir}/stats.JOB" \ - ${_opts} ${lm_args} || { cat "${_logdir}"/stats.1.log; exit 1; } + ${_opts} ${lm_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; } # 4. Aggregate shape files _opts= @@ -845,7 +845,7 @@ if ! "${skip_train}"; then if "${use_ngram}"; then log "Stage 8: Ngram Training: train_set=${data_feats}/lm_train.txt" cut -f 2 -d " " ${data_feats}/lm_train.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa - build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin + build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin else log "Stage 8: Skip ngram stages: use_ngram=${use_ngram}" fi @@ -1132,7 +1132,7 @@ if ! "${skip_eval}"; then # 2. Submit decoding jobs log "Decoding started... log: '${_logdir}/mt_inference.*.log'" - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/mt_inference.JOB.log \ ${python} -m ${mt_inference_tool} \ --batch_size ${batch_size} \ @@ -1142,7 +1142,7 @@ if ! "${skip_eval}"; then --mt_train_config "${mt_exp}"/config.yaml \ --mt_model_file "${mt_exp}"/"${inference_mt_model}" \ --output_dir "${_logdir}"/output.JOB \ - ${_opts} ${inference_args} + ${_opts} ${inference_args} || { cat $(grep -l -i error "${_logdir}"/mt_inference.*.log) ; exit 1; } # 3. Concatenates the output files from each jobs for f in token token_int score text; do @@ -1205,7 +1205,7 @@ if ! "${skip_eval}"; then # ) \ # <(<"${_data}/text.${tgt_case}.${tgt_lang}" awk '{ print "(" $2 "-" $1 ")" }') \ # >"${_scoredir}/hyp.trn.org" - + # remove utterance id #perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn" #perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn" @@ -1220,7 +1220,7 @@ if ! "${skip_eval}"; then -i "${_scoredir}/hyp.trn.detok" \ -m bleu chrf ter \ >> ${_scoredir}/result.tc.txt - + log "Write a case-sensitive BLEU (single-reference) result in ${_scoredir}/result.tc.txt" fi @@ -1252,8 +1252,8 @@ if ! "${skip_eval}"; then ) \ <(<"${_data}/text.${tgt_case}.${tgt_lang}" awk '{ print "(" $2 "-" $1 ")" }') \ >"${_scoredir}/ref.trn.org.${ref_idx}" - - # + + # perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}" detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}" remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}" @@ -1386,11 +1386,11 @@ if ! "${skip_upload_hf}"; then gitlfs=$(git lfs --version 2> /dev/null || true) [ -z "${gitlfs}" ] && \ log "ERROR: You need to install git-lfs first" && \ - exit 1 - + exit 1 + dir_repo=${expdir}/hf_${hf_repo//"/"/"_"} [ ! -d "${dir_repo}" ] && git clone https://huggingface.co/${hf_repo} ${dir_repo} - + if command -v git &> /dev/null; then _creator_name="$(git config user.name)" _checkout="git checkout $(git show -s --format=%H)" @@ -1403,13 +1403,13 @@ if ! 
"${skip_upload_hf}"; then # foo/asr1 -> foo _corpus="${_task%/*}" _model_name="${_creator_name}/${_corpus}_$(basename ${packed_model} .zip)" - + # copy files in ${dir_repo} unzip -o ${packed_model} -d ${dir_repo} # Generate description file # shellcheck disable=SC2034 hf_task=machine-translation - # shellcheck disable=SC2034 + # shellcheck disable=SC2034 espnet_task=MT # shellcheck disable=SC2034 task_exp=${mt_exp} diff --git a/egs2/TEMPLATE/ssl1/hubert.sh b/egs2/TEMPLATE/ssl1/hubert.sh index 8a6f7590cb8..027b6636782 100755 --- a/egs2/TEMPLATE/ssl1/hubert.sh +++ b/egs2/TEMPLATE/ssl1/hubert.sh @@ -143,7 +143,7 @@ Options: # Pretrain related --pretrain_configs # configration files of pretraining stage --n_clusters # number of k-means clusters of pretraining stage - --features_km # feature for k-means clustering of pretraining stage + --features_km # feature for k-means clustering of pretraining stage --pt_args # Arguments for hubert model pretraining (default="${pt_args}"). # e.g., --pt_args "--max_epoch 10" # Note that it will overwrite args in pt config. @@ -180,7 +180,7 @@ fi [ -z "${valid_set}" ] && { log "${help_message}"; log "Error: --valid_set is required"; exit 2; }; # Check pretrain_config, n_clusters and feature list -pretrain_config_list=(${pretrain_configs// / }) +pretrain_config_list=(${pretrain_configs// / }) n_clusters_list=(${n_clusters// / }) feature_list=(${features_km// / }) if ! [ ${pretrain_start_iter} -le ${pretrain_stop_iter} ]; then @@ -227,7 +227,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ "${feats_type}" = raw ]; then log "Stage 3: Format wav.scp: data/ -> ${data_feats}" - + # ====== Recreating "wav.scp" ====== # Kaldi-wav.scp, which can describe the file path with unix-pipe, like "cat /some/path |", # shouldn't be used in training process. @@ -235,7 +235,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # and it can also change the audio-format and sampling rate. # If nothing is need, then format_wav_scp.sh does nothing: # i.e. the input file format and rate is same as the output. 
- + for dset in "${train_set}" "${valid_set}"; do _suf="/org" utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}${_suf}/${dset}" @@ -253,7 +253,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \ --audio-format "${audio_format}" --fs "${fs}" ${_opts} \ "data/${dset}/wav.scp" "${data_feats}${_suf}/${dset}" - + echo "${feats_type}" > "${data_feats}${_suf}/${dset}/feats_type" done else @@ -265,21 +265,21 @@ fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then log "Stage 4: Remove long/short data: ${data_feats}/org -> ${data_feats}" - + # NOTE(kamo): Not applying to test_sets to keep original data for dset in "${train_set}" "${valid_set}"; do - + # Copy data dir utils/copy_data_dir.sh --validate_opts --non-print "${data_feats}/org/${dset}" "${data_feats}/${dset}" cp "${data_feats}/org/${dset}/feats_type" "${data_feats}/${dset}/feats_type" - + # Remove short utterances _feats_type="$(<${data_feats}/${dset}/feats_type)" if [ "${_feats_type}" = raw ]; then _fs=$(python3 -c "import humanfriendly as h;print(h.parse_size('${fs}'))") _min_length=$(python3 -c "print(int(${min_wav_duration} * ${_fs}))") _max_length=$(python3 -c "print(int(${max_wav_duration} * ${_fs}))") - + # utt2num_samples is created by format_wav_scp.sh <"${data_feats}/org/${dset}/utt2num_samples" \ awk -v min_length="${_min_length}" -v max_length="${_max_length}" \ @@ -291,11 +291,11 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then else log "Error: not supported: --feats_type ${feats_type}" fi - + # Remove empty text <"${data_feats}/org/${dset}/text" \ awk ' { if( NF != 1 ) print $0; } ' >"${data_feats}/${dset}/text" - + # fix_data_dir.sh leaves only utts which exist in all files utils/fix_data_dir.sh "${data_feats}/${dset}" done @@ -303,7 +303,7 @@ fi if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 5 ]; then - + for ((iter=${pretrain_start_iter}; iter<=${pretrain_stop_iter};iter++)); do asr_config="${pretrain_config_list[${iter}]}" if [ "${lang}" != noinfo ]; then @@ -311,25 +311,25 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 5 ]; then else asr_stats_dir="${expdir}/pretrain_iter${iter}_stats_${feats_type}" fi - + if [ -n "${asr_config}" ]; then asr_tag="$(basename "${asr_config}" .yaml)_${feats_type}" else asr_tag="train_${feats_type}" fi - + asr_exp="${expdir}/pretrain_${asr_tag}_iter${iter}" - + train_set_plabel=$(eval "echo ${train_set}_\${feature_list[${iter}]}_km\${n_clusters_list[${iter}]}") valid_set_plabel=$(eval "echo ${valid_set}_\${feature_list[${iter}]}_km\${n_clusters_list[${iter}]}") - + feats_km="${feature_list[${iter}]}" n_clusters="${n_clusters_list[${iter}]}" dictdir="./data/${feats_km}_km${n_clusters}_token_list_iter${iter}/${token_type}" - + if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then log "Stage 5.iter${iter}: Running ${n_clusters} cluster K-means on ${feats_km} feature." 
- + if [ ${iter} -eq 0 ] || [ ${feats_km} == "mfcc" ]; then ./scripts/km.sh \ --train_set "${train_set}" \ @@ -354,21 +354,21 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 5 ]; then --hubert_dir_path "${expdir}/pretrained_model_iter$((iter-1))"/valid.acc.best.pth fi fi - + if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then _asr_train_dir="${data_feats}/${train_set_plabel}" _asr_valid_dir="${data_feats}/${valid_set_plabel}" - + log "Stage 6.iter${iter}: ${feats_km} pretrain model collect stats: \ train_set=${_asr_train_dir}, valid_set=${_asr_valid_dir}" - + _opts= if [ -n "${asr_config}" ]; then # To generate the config file: e.g. # % python3 -m espnet2.bin.asr_train --print_config --optim adam _opts+="--config ${asr_config} " fi - + _feats_type="$(<${_asr_train_dir}/feats_type)" if [ "${_feats_type}" = raw ]; then _scp=wav.scp @@ -385,14 +385,14 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 5 ]; then _input_size="$(<${_asr_train_dir}/feats_dim)" _opts+="--input_size=${_input_size} " fi - + # 1. Split the key file _logdir="${asr_stats_dir}/logdir" mkdir -p "${_logdir}" - + # Get the minimum number among ${nj} and the number lines of input files _nj=$(min "${nj}" "$(<${_asr_train_dir}/${_scp} wc -l)" "$(<${_asr_valid_dir}/${_scp} wc -l)") - + key_file="${_asr_train_dir}/${_scp}" split_scps="" for n in $(seq "${_nj}"); do @@ -400,7 +400,7 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 5 ]; then done # shellcheck disable=SC2086 utils/split_scp.pl "${key_file}" ${split_scps} - + key_file="${_asr_valid_dir}/${_scp}" split_scps="" for n in $(seq "${_nj}"); do @@ -408,18 +408,18 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 5 ]; then done # shellcheck disable=SC2086 utils/split_scp.pl "${key_file}" ${split_scps} - + # 2. Generate run.sh log "Generate '${asr_stats_dir}/run.sh'. You can resume the process from stage 5.iter${iter} using this script" mkdir -p "${asr_stats_dir}"; echo "${run_args} --stage 6 \"\$@\"; exit \$?" > "${asr_stats_dir}/run.sh"; chmod +x "${asr_stats_dir}/run.sh" - + # 3. Submit jobs log "Hubert pretraining collect-stats started... log: '${_logdir}/stats.*.log'" - + # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted, # but it's used only for deciding the sample ids. - - # shellcheck disable=SC2086 + + # shellcheck disableSC2046,SC2086 ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \ ${python} -m espnet2.bin.hubert_train \ --collect_stats true \ @@ -439,8 +439,8 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 5 ]; then --valid_shape_file "${_logdir}/valid.JOB.scp" \ --output_dir "${_logdir}/stats.JOB" \ --hubert_dict "${dictdir}/dict.txt" \ - ${_opts} ${pt_args} || { cat "${_logdir}"/stats.1.log; exit 1; } - + ${_opts} ${pt_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; } + # 4. Aggregate shape files _opts= for i in $(seq "${_nj}"); do @@ -448,30 +448,30 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 5 ]; then done # shellcheck disable=SC2086 ${python} -m espnet2.bin.aggregate_stats_dirs ${_opts} --output_dir "${asr_stats_dir}" - + # Append the num-tokens at the last dimensions. 
This is used for batch-bins count <"${asr_stats_dir}/train/text_shape" \ awk -v N="$(<${dictdir}/tokens.txt wc -l)" '{ print $0 "," N }' \ >"${asr_stats_dir}/train/text_shape.${token_type}" - + <"${asr_stats_dir}/valid/text_shape" \ awk -v N="$(<${dictdir}/tokens.txt wc -l)" '{ print $0 "," N }' \ >"${asr_stats_dir}/valid/text_shape.${token_type}" fi - + if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then _asr_train_dir="${data_feats}/${train_set_plabel}" _asr_valid_dir="${data_feats}/${valid_set_plabel}" - + log "Stage 7.iter${iter}: Hubert Pretraining: train_set=${_asr_train_dir}, valid_set=${_asr_valid_dir}" - + _opts= if [ -n "${asr_config}" ]; then # To generate the config file: e.g. # % python3 -m espnet2.bin.hubert_train --print_config --optim adam _opts+="--config ${asr_config} " fi - + _feats_type="$(<${_asr_train_dir}/feats_type)" if [ "${_feats_type}" = raw ]; then _scp=wav.scp @@ -488,14 +488,14 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 5 ]; then _type=kaldi_ark _fold_length="${asr_speech_fold_length}" _input_size="$(<${_asr_train_dir}/feats_dim)" - _opts+="--input_size=${_input_size} " + _opts+="--input_size=${_input_size} " fi - + if [ "${num_splits_asr}" -gt 1 ]; then # If you met a memory error when parsing text files, this option may help you. # The corpus is split into subsets and each subset is used for training one by one in order, # so the memory footprint can be limited to the memory required for each dataset. - + _split_dir="${asr_stats_dir}/splits${num_splits_asr}" if [ ! -f "${_split_dir}/.done" ]; then rm -f "${_split_dir}/.done" @@ -511,23 +511,23 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 5 ]; then else log "${_split_dir}/.done exists. Spliting is skipped" fi - + _opts+="--train_data_path_and_name_and_type ${_split_dir}/${_scp},speech,${_type} " _opts+="--train_data_path_and_name_and_type ${_split_dir}/text,text,text " _opts+="--train_shape_file ${_split_dir}/speech_shape " _opts+="--train_shape_file ${_split_dir}/text_shape.${token_type} " _opts+="--multiple_iterator true " - + else _opts+="--train_data_path_and_name_and_type ${_asr_train_dir}/${_scp},speech,${_type} " _opts+="--train_data_path_and_name_and_type ${_asr_train_dir}/text,text,text " _opts+="--train_shape_file ${asr_stats_dir}/train/speech_shape " _opts+="--train_shape_file ${asr_stats_dir}/train/text_shape.${token_type} " fi - + log "Generate '${asr_exp}/run.sh'. You can resume the process from stage 6 using this script" mkdir -p "${asr_exp}"; echo "${run_args} --stage 7 \"\$@\"; exit \$?" > "${asr_exp}/run.sh"; chmod +x "${asr_exp}/run.sh" - + # NOTE(kamo): --fold_length is used only if --batch_type=folded and it's ignored in the other case log "Hubert pretraining started... log: '${asr_exp}/train.log'" if echo "${cuda_cmd}" | grep -e queue.pl -e queue-freegpu.pl &> /dev/null; then @@ -536,7 +536,7 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 5 ]; then else jobname="${asr_exp}/train.log" fi - + # shellcheck disable=SC2086 ${python} -m espnet2.bin.launch \ --cmd "${cuda_cmd} --name ${jobname}" \ @@ -564,19 +564,19 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 5 ]; then --output_dir "${asr_exp}" \ --hubert_dict "${dictdir}/dict.txt" \ ${_opts} ${pt_args} - + if [ "${iter}" -ge 0 ]; then log "Create a symbolic link of the pretrained model" if [ -L "${expdir}/pretrained_model_iter${iter}" ]; then log "Symbolic link ${expdir}/pretrained_model_iter${iter} already exists, remove it." rm "${expdir}/pretrained_model_iter${iter}" fi - + if ! 
[ -z "${asr_exp}" ]; then ln -s "../${asr_exp}" "${expdir}/pretrained_model_iter${iter}" fi fi - + log "Model saved in: ${asr_exp}" else log "Skip the pretraining stages" diff --git a/egs2/TEMPLATE/st1/st.sh b/egs2/TEMPLATE/st1/st.sh index 18303210f87..ebd2903d7a7 100755 --- a/egs2/TEMPLATE/st1/st.sh +++ b/egs2/TEMPLATE/st1/st.sh @@ -505,9 +505,9 @@ if ! "${skip_data_prep}"; then done utils/combine_data.sh --extra_files "${utt_extra_files}" "data/${train_set}_sp" ${_dirs} for extra_file in ${utt_extra_files}; do - python pyscripts/utils/remove_duplicate_keys.py data/"${train_set}_sp"/${extra_file} > data/"${train_set}_sp"/${extra_file}.tmp + python pyscripts/utils/remove_duplicate_keys.py data/"${train_set}_sp"/${extra_file} > data/"${train_set}_sp"/${extra_file}.tmp mv data/"${train_set}_sp"/${extra_file}.tmp data/"${train_set}_sp"/${extra_file} - done + done else log "Skip stage 2: Speed perturbation" fi @@ -544,7 +544,7 @@ if ! "${skip_data_prep}"; then for single_file in $(ls data/"${dset}"/${extra_file}*); do cp ${single_file} "${data_feats}${_suf}/${dset}" expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})" - done + done done echo "${expand_utt_extra_files}" utils/fix_data_dir.sh --utt_extra_files "${expand_utt_extra_files}" "${data_feats}${_suf}/${dset}" @@ -589,7 +589,7 @@ if ! "${skip_data_prep}"; then for single_file in $(ls data/"${dset}"/${extra_file}*); do cp ${single_file} "${data_feats}${_suf}/${dset}" expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})" - done + done done for extra_file in ${expand_utt_extra_files}; do LC_ALL=C sort -u -k1,1 "${data_feats}${_suf}/${dset}/${extra_file}" -o "${data_feats}${_suf}/${dset}/${extra_file}" @@ -638,7 +638,7 @@ if ! "${skip_data_prep}"; then for single_file in $(ls data/"${dset}"/${extra_file}*); do cp ${single_file} "${data_feats}${_suf}/${dset}" expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})" - done + done done utils/fix_data_dir.sh --utt_extra_files "${expand_utt_extra_files}*" "${data_feats}${_suf}/${dset}" for extra_file in ${expand_utt_extra_files}; do @@ -724,9 +724,9 @@ if ! "${skip_data_prep}"; then utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" "${data_feats}/${dset}" for utt_extra_file in ${utt_extra_files}; do python pyscripts/utils/remove_duplicate_keys.py ${data_feats}/${dset}/${utt_extra_file} \ - > ${data_feats}/${dset}/${utt_extra_file}.tmp + > ${data_feats}/${dset}/${utt_extra_file}.tmp mv ${data_feats}/${dset}/${utt_extra_file}.tmp ${data_feats}/${dset}/${utt_extra_file} - done + done done # shellcheck disable=SC2002 @@ -931,7 +931,7 @@ if ! "${skip_train}"; then log "LM collect-stats started... log: '${_logdir}/stats.*.log'" # NOTE: --*_shape_file doesn't require length information if --batch_type=unsorted, # but it's used only for deciding the sample ids. - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \ ${python} -m espnet2.bin.lm_train \ --collect_stats true \ @@ -947,7 +947,7 @@ if ! "${skip_train}"; then --train_shape_file "${_logdir}/train.JOB.scp" \ --valid_shape_file "${_logdir}/dev.JOB.scp" \ --output_dir "${_logdir}/stats.JOB" \ - ${_opts} ${lm_args} || { cat "${_logdir}"/stats.1.log; exit 1; } + ${_opts} ${lm_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; } # 4. Aggregate shape files _opts= @@ -1075,7 +1075,7 @@ if ! 
"${skip_train}"; then if "${use_ngram}"; then log "Stage 9: Ngram Training: train_set=${data_feats}/lm_train.txt" cut -f 2 -d " " ${data_feats}/lm_train.txt | lmplz -S "20%" --discount_fallback -o ${ngram_num} - >${ngram_exp}/${ngram_num}gram.arpa - build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin + build_binary -s ${ngram_exp}/${ngram_num}gram.arpa ${ngram_exp}/${ngram_num}gram.bin else log "Stage 9: Skip ngram stages: use_ngram=${use_ngram}" fi @@ -1427,7 +1427,7 @@ if ! "${skip_eval}"; then # 2. Submit decoding jobs log "Decoding started... log: '${_logdir}/st_inference.*.log'" - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/st_inference.JOB.log \ ${python} -m ${st_inference_tool} \ --batch_size ${batch_size} \ @@ -1437,7 +1437,7 @@ if ! "${skip_eval}"; then --st_train_config "${st_exp}"/config.yaml \ --st_model_file "${st_exp}"/"${inference_st_model}" \ --output_dir "${_logdir}"/output.JOB \ - ${_opts} ${inference_args} + ${_opts} ${inference_args} || { cat $(grep -l -i error "${_logdir}"/st_inference.*.log) ; exit 1; } # 3. Concatenates the output files from each jobs for f in token token_int score text; do @@ -1483,7 +1483,7 @@ if ! "${skip_eval}"; then ) \ <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \ >"${_scoredir}/hyp.trn.org" - + # remove utterance id perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org" > "${_scoredir}/ref.trn" perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/hyp.trn.org" > "${_scoredir}/hyp.trn" @@ -1498,7 +1498,7 @@ if ! "${skip_eval}"; then -i "${_scoredir}/hyp.trn.detok" \ -m bleu chrf ter \ >> ${_scoredir}/result.tc.txt - + log "Write a case-sensitive BLEU (single-reference) result in ${_scoredir}/result.tc.txt" fi @@ -1530,8 +1530,8 @@ if ! "${skip_eval}"; then ) \ <(<"${_data}/utt2spk" awk '{ print "(" $2 "-" $1 ")" }') \ >"${_scoredir}/ref.trn.org.${ref_idx}" - - # + + # perl -pe 's/\([^\)]+\)//g;' "${_scoredir}/ref.trn.org.${ref_idx}" > "${_scoredir}/ref.trn.${ref_idx}" detokenizer.perl -l ${tgt_lang} -q < "${_scoredir}/ref.trn.${ref_idx}" > "${_scoredir}/ref.trn.detok.${ref_idx}" remove_punctuation.pl < "${_scoredir}/ref.trn.detok.${ref_idx}" > "${_scoredir}/ref.trn.detok.lc.rm.${ref_idx}" @@ -1667,11 +1667,11 @@ if ! "${skip_upload_hf}"; then gitlfs=$(git lfs --version 2> /dev/null || true) [ -z "${gitlfs}" ] && \ log "ERROR: You need to install git-lfs first" && \ - exit 1 - + exit 1 + dir_repo=${expdir}/hf_${hf_repo//"/"/"_"} [ ! -d "${dir_repo}" ] && git clone https://huggingface.co/${hf_repo} ${dir_repo} - + if command -v git &> /dev/null; then _creator_name="$(git config user.name)" _checkout="git checkout $(git show -s --format=%H)" @@ -1684,13 +1684,13 @@ if ! "${skip_upload_hf}"; then # foo/asr1 -> foo _corpus="${_task%/*}" _model_name="${_creator_name}/${_corpus}_$(basename ${packed_model} .zip)" - + # copy files in ${dir_repo} unzip -o ${packed_model} -d ${dir_repo} # Generate description file # shellcheck disable=SC2034 hf_task=speech-translation - # shellcheck disable=SC2034 + # shellcheck disable=SC2034 espnet_task=ST # shellcheck disable=SC2034 task_exp=${st_exp} diff --git a/egs2/TEMPLATE/tts1/tts.sh b/egs2/TEMPLATE/tts1/tts.sh index 0bd2e0debb8..13a3aaf2d5d 100755 --- a/egs2/TEMPLATE/tts1/tts.sh +++ b/egs2/TEMPLATE/tts1/tts.sh @@ -644,7 +644,7 @@ if ! "${skip_train}"; then # 3. Submit jobs log "TTS collect_stats started... 
log: '${_logdir}/stats.*.log'" - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \ ${python} -m "espnet2.bin.${tts_task}_train" \ --collect_stats true \ @@ -665,7 +665,7 @@ if ! "${skip_train}"; then --train_shape_file "${_logdir}/train.JOB.scp" \ --valid_shape_file "${_logdir}/valid.JOB.scp" \ --output_dir "${_logdir}/stats.JOB" \ - ${_opts} ${train_args} || { cat "${_logdir}"/stats.1.log; exit 1; } + ${_opts} ${train_args} || { cat $(grep -l -i error "${_logdir}"/stats.*.log) ; exit 1; } # 4. Aggregate shape files _opts= @@ -1008,7 +1008,7 @@ if ! "${skip_eval}"; then # 3. Submit decoding jobs log "Decoding started... log: '${_logdir}/tts_inference.*.log'" - # shellcheck disable=SC2086 + # shellcheck disable=SC2046,SC2086 ${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/tts_inference.JOB.log \ ${python} -m espnet2.bin.tts_inference \ --ngpu "${_ngpu}" \ @@ -1019,7 +1019,7 @@ if ! "${skip_eval}"; then --train_config "${tts_exp}"/config.yaml \ --output_dir "${_logdir}"/output.JOB \ --vocoder_file "${vocoder_file}" \ - ${_opts} ${_ex_opts} ${inference_args} + ${_opts} ${_ex_opts} ${inference_args} || { cat $(grep -l -i error "${_logdir}"/tts_inference.*.log) ; exit 1; } # 4. Concatenates the output files from each jobs if [ -e "${_logdir}/output.${_nj}/norm" ]; then
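
Note on the pattern applied across the scripts above: every `${train_cmd}`/`${_cmd}` job array now ends with `|| { cat $(grep -l -i error "${_logdir}"/<name>.*.log) ; exit 1; }`, so on failure the recipe dumps only the per-job logs that actually contain an error, instead of unconditionally printing `stats.1.log` (which may belong to a job that succeeded) or printing nothing at all. `SC2046` is added to the existing `# shellcheck disable=` directives because the `$(grep -l ...)` substitution is deliberately left unquoted, letting multiple matching file names word-split into separate arguments for `cat`.

A minimal, self-contained sketch of the same idea, assuming a hypothetical worker script `./run_worker.sh`, `LOGDIR`, and `NJ` in place of the recipe's `${train_cmd} JOB=1:"${_nj}"` job arrays; it additionally guards against the case where no log matches, which the one-liner in the patch leaves to the caller:

    #!/usr/bin/env bash
    # Sketch of the "dump only the failing logs" pattern from this patch.
    # ./run_worker.sh, LOGDIR, and NJ are hypothetical stand-ins, not ESPnet code.
    set -euo pipefail

    LOGDIR=exp/logdir
    NJ=4
    mkdir -p "${LOGDIR}"

    # Launch NJ parallel workers, each writing its own log (mirrors stats.JOB.log).
    pids=()
    for n in $(seq "${NJ}"); do
        ./run_worker.sh "${n}" > "${LOGDIR}/stats.${n}.log" 2>&1 &
        pids+=("$!")
    done

    # Collect exit statuses.
    failed=false
    for pid in "${pids[@]}"; do
        wait "${pid}" || failed=true
    done

    if "${failed}"; then
        # grep -l prints only the names of the files that match, so just the
        # logs that actually mention an error are shown.
        error_logs=$(grep -l -i error "${LOGDIR}"/stats.*.log || true)
        # Intentionally unquoted so several file names split into separate
        # arguments (the reason the patch adds SC2046 to the directives).
        [ -n "${error_logs}" ] && cat ${error_logs}
        exit 1
    fi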