Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
ftshijt committed Apr 27, 2022
2 parents 72b6b21 + 4a12ab3 commit 04d0cd8
Show file tree
Hide file tree
Showing 138 changed files with 4,329 additions and 127 deletions.
10 changes: 10 additions & 0 deletions ci/test_integration_espnet2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,16 @@ if python3 -c "import fairseq" &> /dev/null; then
cd "${cwd}"
fi

# [ESPnet2] test enh_asr1 recipe
# Guard: the joint enhancement+ASR recipe requires torch >= 1.2.0.
# Use python3 for the guard, consistent with the other guards in this CI
# script (e.g. the fairseq check above); bare `python` may not exist on PATH.
# NOTE(review): distutils.version.LooseVersion is deprecated (PEP 632);
# consider packaging.version when the CI image guarantees `packaging`.
if python3 -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) >= L("1.2.0")' &> /dev/null; then
    cd ./egs2/mini_an4/enh_asr1
    echo "==== [ESPnet2] ENH_ASR ==="
    ./run.sh --ngpu 0 --stage 0 --stop-stage 15 --skip-upload_hf false --feats-type "raw" --spk-num 1 --enh_asr_args "--max_epoch=1 --enh_separator_conf num_spk=1" --python "${python}"
    # Remove generated files in order to reduce the disk usage
    rm -rf exp dump data
    cd "${cwd}"
fi

# [ESPnet2] Validate configuration files
echo "<blank>" > dummy_token_list
echo "==== [ESPnet2] Validation configuration files ==="
Expand Down
2 changes: 1 addition & 1 deletion egs/commonvoice/asr1/local/download_and_untar.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ fi

# Argument check: three positional args are required (after the optional
# --remove-archive flag has been consumed above).
if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url> <filename>"
  # Example must supply all three arguments, including the <filename>.
  echo "e.g.: $0 /export/data/ https://us.openslr.org/resources/108/FR.tgz FR.tgz"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  # Exit non-zero so callers can detect that the script was misused.
  exit 1;
fi
Expand Down
2 changes: 2 additions & 0 deletions egs2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| fsc_challenge | Fluent Speech Commands Dataset MASE Eval Challenge splits | SLU | ENG | https://github.com/maseEval/mase | |
| gigaspeech | GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio | ASR | ENG | https://github.com/SpeechColab/GigaSpeech | |
| grabo | Grabo dataset | SLU | ENG + NLD | https://www.esat.kuleuven.be/psi/spraak/downloads/ | |
| harpervalley | HarperValleyBank: A Domain-Specific Spoken Dialog Corpus | SLU | ENG | https://github.com/cricketclub/gridspace-stanford-harper-valley | |
| hkust | HKUST/MTS: A very large scale Mandarin telephone speech corpus | ASR | CMN | https://catalog.ldc.upenn.edu/LDC2005S15 | |
| hui_acg | HUI-audio-corpus-german | TTS | DEU | https://opendata.iisys.de/datasets.html#hui-audio-corpus-german | |
| how2 | How2: A Large-scale Dataset for Multimodal Language Understanding | ASR/MT/ST | ENG->POR | https://github.com/srvk/how2-dataset | |
Expand All @@ -61,6 +62,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| ljspeech | The LJ Speech Dataset | TTS | ENG | https://keithito.com/LJ-Speech-Dataset/ | |
| lrs3 | The Oxford-BBC Lip Reading Sentences 3 (LRS3) Dataset | ASR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html | |
| lrs2 | The Oxford-BBC Lip Reading Sentences 2 (LRS2) Dataset | Lipreading/ASR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | |
| mediaspeech | MediaSpeech: Multilanguage ASR Benchmark and Dataset | ASR | FRA | https://www.openslr.org/108/ | |
| microsoft_speech | Microsoft Speech Corpus (Indian languages) | ASR | 3 languages | https://msropendata.com/datasets/7230b4b1-912d-400e-be58-f84e0512985e | |
| mini_an4 | Mini version of CMU AN4 database for the integration test | ASR/TTS/SE | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | |
| mini_librispeech | Mini version of Librispeech corpus | DIAR | ENG | https://openslr.org/31/ | |
Expand Down
3 changes: 3 additions & 0 deletions egs2/TEMPLATE/asr1/db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ LIBRILIGHT_LIMITED=
FSC=
SLURP=
VOXCELEB=
MEDIASPEECH=downloads
MINI_LIBRISPEECH=downloads
MISP2021=
LIBRIMIX=downloads
Expand Down Expand Up @@ -139,6 +140,7 @@ MALAYALAM=downloads
ST_CMDS=downloads
MS_INDIC_IS18=
MARATHI=downloads
HARPERVALLEY=downloads

# For only CMU TIR environment
if [[ "$(hostname)" == tir* ]]; then
Expand Down Expand Up @@ -216,6 +218,7 @@ if [[ "$(hostname -d)" == clsp.jhu.edu ]]; then
FSC=
SNIPS= # smart-light-en-closed-field data path
SLURP=
MEDIASPEECH=downloads
MINI_LIBRISPEECH=downloads
LIBRITTS=
LJSPEECH=downloads
Expand Down
26 changes: 19 additions & 7 deletions egs2/TEMPLATE/enh1/enh.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ download_model=
# Evaluation related
scoring_protocol="STOI SDR SAR SIR SI_SNR"
ref_channel=0
inference_tag= # Prefix to the result dir for ENH inference.
inference_enh_config= # Config for enhancement.
score_with_asr=false
asr_exp="" # asr model for scoring WER
lm_exp="" # lm model for scoring WER
Expand Down Expand Up @@ -151,8 +153,9 @@ Options:
--init_param # pretrained model path and module name (default="${init_param}")
# Enhancement related
--inference_args # Arguments for enhancement in the inference stage (default="${inference_args}")
--inference_model # Enhancement model path for inference (default="${inference_model}").
--inference_args # Arguments for enhancement in the inference stage (default="${inference_args}")
--inference_model # Enhancement model path for inference (default="${inference_model}").
--inference_enh_config # Configuration file for overwriting some model attributes during SE inference. (default="${inference_enh_config}")
# Evaluation related
--scoring_protocol # Metrics to be used for scoring (default="${scoring_protocol}")
Expand Down Expand Up @@ -250,6 +253,14 @@ if [ -n "${speed_perturb_factors}" ]; then
enh_exp="${enh_exp}_sp"
fi

# Default the result-dir prefix for SE inference when the user gave none:
# derive it from the enhancement config's basename, or fall back to "enhanced".
if [ -z "${inference_tag}" ]; then
    case "${inference_enh_config}" in
        "") inference_tag=enhanced ;;
        *)  inference_tag="$(basename "${inference_enh_config}" .yaml)" ;;
    esac
fi

# ========================== Main stages start from here. ==========================

if ! "${skip_data_prep}"; then
Expand Down Expand Up @@ -617,7 +628,7 @@ if ! "${skip_eval}"; then

for dset in "${valid_set}" ${test_sets}; do
_data="${data_feats}/${dset}"
_dir="${enh_exp}/enhanced_${dset}"
_dir="${enh_exp}/${inference_tag}_${dset}"
_logdir="${_dir}/logdir"
mkdir -p "${_logdir}"

Expand Down Expand Up @@ -649,6 +660,7 @@ if ! "${skip_eval}"; then
--data_path_and_name_and_type "${_data}/${_scp},speech_mix,${_type}" \
--key_file "${_logdir}"/keys.JOB.scp \
--train_config "${enh_exp}"/config.yaml \
${inference_enh_config:+--inference_config "$inference_enh_config"} \
--model_file "${enh_exp}"/"${inference_model}" \
--output_dir "${_logdir}"/output.JOB \
${_opts} ${inference_args}
Expand Down Expand Up @@ -689,7 +701,7 @@ if ! "${skip_eval}"; then
if "${score_obs}"; then
_dir="${data_feats}/${dset}/scoring"
else
_dir="${enh_exp}/enhanced_${dset}/scoring"
_dir="${enh_exp}/${inference_tag}_${dset}/scoring"
fi

_logdir="${_dir}/logdir"
Expand All @@ -716,7 +728,7 @@ if ! "${skip_eval}"; then
# To compute the score of observation, input original wav.scp
_inf_scp+="--inf_scp ${data_feats}/${dset}/wav.scp "
else
_inf_scp+="--inf_scp ${enh_exp}/enhanced_${dset}/spk${spk}.scp "
_inf_scp+="--inf_scp ${enh_exp}/${inference_tag}_${dset}/spk${spk}.scp "
fi
done

Expand Down Expand Up @@ -752,7 +764,7 @@ if ! "${skip_eval}"; then
./scripts/utils/show_enh_score.sh "${_dir}/../.." > "${_dir}/../../RESULTS.md"
done
log "Evaluation result for observation: ${data_feats}/RESULTS.md"
log "Evaluation result for enhancement: ${enh_exp}/enhanced/RESULTS.md"
log "Evaluation result for enhancement: ${enh_exp}/RESULTS.md"

fi
else
Expand Down Expand Up @@ -811,7 +823,7 @@ if "${score_with_asr}"; then
# Using same wav.scp for all speakers
cp "${_data}/wav.scp" "${_ddir}/wav.scp"
else
cp "${enh_exp}/enhanced_${dset}/scoring/wav_spk${spk}" "${_ddir}/wav.scp"
cp "${enh_exp}/${inference_tag}_${dset}/scoring/wav_spk${spk}" "${_ddir}/wav.scp"
fi
cp data/${dset}/text_spk${spk} ${_ddir}/text
cp ${_data}/{spk2utt,utt2spk,utt2num_samples,feats_type} ${_ddir}
Expand Down
2 changes: 1 addition & 1 deletion egs2/TEMPLATE/enh1/scripts/utils/show_enh_score.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ fi
[ -f ./path.sh ] && . ./path.sh
set -euo pipefail
# Experiment dir selection: resolve the single optional argument to an
# absolute path; default to ./exp when no argument is supplied.
case $# in
    1) exp=$(realpath "$1") ;;
    *) exp=exp ;;
esac
Expand Down
39 changes: 27 additions & 12 deletions egs2/TEMPLATE/enh_asr1/enh_asr.sh
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ dereverb_ref_num=1
# Evaluation related
scoring_protocol="STOI SDR SAR SIR SI_SNR"
ref_channel=0
inference_enh_tag= # Prefix to the result dir for ENH inference.
inference_enh_config= # Config for enhancement.

# Enh Training data related
use_dereverb_ref=false
Expand Down Expand Up @@ -453,6 +455,14 @@ if [ -z "${inference_tag}" ]; then
fi
fi

# Pick a default tag for the enhancement-inference output dir when none was
# supplied: start from the fallback and override it from the config basename.
if [ -z "${inference_enh_tag}" ]; then
    inference_enh_tag=enhanced
    if [ -n "${inference_enh_config}" ]; then
        inference_enh_tag="$(basename "${inference_enh_config}" .yaml)"
    fi
fi

# ========================== Main stages start from here. ==========================

if ! "${skip_data_prep}"; then
Expand Down Expand Up @@ -518,7 +528,10 @@ if ! "${skip_data_prep}"; then
expand_utt_extra_files=""
for extra_file in ${utt_extra_files}; do
# with regex to support multi-references
for single_file in $(ls data/"${dset}"/${extra_file}*); do
for single_file in "data/${dset}/${extra_file}"*; do
if [ ! -f "${single_file}" ]; then
continue
fi
cp ${single_file} "${data_feats}${_suf}/${dset}"
expand_utt_extra_files="${expand_utt_extra_files} $(basename ${single_file})"
done
Expand Down Expand Up @@ -553,7 +566,7 @@ if ! "${skip_data_prep}"; then
# shellcheck disable=SC2086
scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
--out-filename "${spk}.scp" \
--ref_channels "0" \
--ref_channels "${ref_channel}" \
--audio-format "${audio_format}" --fs "${fs}" ${_opts} \
"data/${dset}/${spk}.scp" "${data_feats}${_suf}/${dset}" \
"${data_feats}${_suf}/${dset}/logs/${spk}" "${data_feats}${_suf}/${dset}/data/${spk}"
Expand Down Expand Up @@ -1259,6 +1272,7 @@ if ! "${skip_eval}"; then
# shellcheck disable=SC2086
${_cmd} --gpu "${_ngpu}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
${python} -m ${asr_inference_tool} \
--enh_s2t_task true \
--batch_size ${batch_size} \
--ngpu "${_ngpu}" \
--data_path_and_name_and_type "${_data}/${_scp},speech,${_type}" \
Expand Down Expand Up @@ -1293,12 +1307,12 @@ if ! "${skip_eval}"; then
_opts=

# 2. Generate run.sh
log "Generate '${enh_asr_exp}/${inference_tag}/run.sh'. You can resume the process from stage 13 using this script"
mkdir -p "${enh_asr_exp}/${inference_tag}"; echo "${run_args} --stage 13 \"\$@\"; exit \$?" > "${enh_asr_exp}/${inference_tag}/run.sh"; chmod +x "${enh_asr_exp}/${inference_tag}/run.sh"
log "Generate '${enh_asr_exp}/run_enhance.sh'. You can resume the process from stage 13 using this script"
mkdir -p "${enh_asr_exp}"; echo "${run_args} --stage 13 \"\$@\"; exit \$?" > "${enh_asr_exp}/run_enhance.sh"; chmod +x "${enh_asr_exp}/run_enhance.sh"

for dset in ${test_sets}; do
_data="${data_feats}/${dset}"
_dir="${enh_asr_exp}/${inference_tag}/${dset}"
_dir="${enh_asr_exp}/${inference_enh_tag}_${dset}"
_logdir="${_dir}/logdir"
mkdir -p "${_logdir}"

Expand Down Expand Up @@ -1330,6 +1344,7 @@ if ! "${skip_eval}"; then
--data_path_and_name_and_type "${_data}/${_scp},speech_mix,${_type}" \
--key_file "${_logdir}"/keys.JOB.scp \
--train_config "${enh_asr_exp}"/config.yaml \
${inference_enh_config:+--inference_config "$inference_enh_config"} \
--model_file "${enh_asr_exp}"/"${inference_enh_asr_model}" \
--output_dir "${_logdir}"/output.JOB \
${_opts} ${enh_inference_args}
Expand Down Expand Up @@ -1472,17 +1487,17 @@ if ! "${skip_eval}"; then
# for score_obs in true false; do
for score_obs in true false; do
# Perform only once, for the observation pass
if "${score_obs}" && [ -e "${data_feats}/RESULTS.md" ]; then
log "${data_feats}/RESULTS.md already exists. The scoring for observation will be skipped"
if "${score_obs}" && [ -e "${data_feats}/RESULTS_enh.md" ]; then
log "${data_feats}/RESULTS_enh.md already exists. The scoring for observation will be skipped"
continue
fi

for dset in ${test_sets}; do
_data="${data_feats}/${dset}"
if "${score_obs}"; then
_dir="${data_feats}/${dset}/scoring_enh"
_dir="${data_feats}/${dset}/scoring"
else
_dir="${enh_asr_exp}/${inference_tag}/${dset}/scoring_enh"
_dir="${enh_asr_exp}/${inference_enh_tag}_${dset}/scoring"
fi

_logdir="${_dir}/logdir"
Expand All @@ -1508,7 +1523,7 @@ if ! "${skip_eval}"; then
# To compute the score of observation, input original wav.scp
_inf_scp+="--inf_scp ${data_feats}/${dset}/wav.scp "
else
_inf_scp+="--inf_scp ${enh_asr_exp}/${inference_tag}/${dset}/spk${spk}.scp "
_inf_scp+="--inf_scp ${enh_asr_exp}/${inference_enh_tag}_${dset}/spk${spk}.scp "
fi
done

Expand Down Expand Up @@ -1544,7 +1559,7 @@ if ! "${skip_eval}"; then
./scripts/utils/show_enh_score.sh "${_dir}/../.." > "${_dir}/../../RESULTS_enh.md"
done
log "Evaluation result for observation: ${data_feats}/RESULTS_enh.md"
log "Evaluation result for enhancement: ${enh_asr_exp}/enhanced/RESULTS_enh.md"
log "Evaluation result for enhancement: ${enh_asr_exp}/RESULTS_enh.md"

fi
else
Expand Down Expand Up @@ -1620,7 +1635,7 @@ if ! "${skip_upload_hf}"; then
# shellcheck disable=SC2034
espnet_task=EnhS2T
# shellcheck disable=SC2034
task_exp=${enh_st_exp}
task_exp=${enh_asr_exp}
eval "echo \"$(cat scripts/utils/TEMPLATE_HF_Readme.md)\"" > "${dir_repo}"/README.md

this_folder=${PWD}
Expand Down
Loading

0 comments on commit 04d0cd8

Please sign in to comment.