From b6196c6170603cc47707464dfb1e264c1a2e3ddb Mon Sep 17 00:00:00 2001 From: Chaitanya Narisetty Date: Thu, 24 Mar 2022 11:14:03 -0400 Subject: [PATCH 1/3] bug fixes in data prep --- egs2/ms_indic_18/asr1/local/data.sh | 10 ++++++---- egs2/ms_indic_18/asr1/local/prepare_data.py | 21 +++++++++------------ egs2/ms_indic_18/asr1/run.sh | 6 +++--- 3 files changed, 18 insertions(+), 19 deletions(-) mode change 100644 => 100755 egs2/ms_indic_18/asr1/local/data.sh mode change 100644 => 100755 egs2/ms_indic_18/asr1/local/prepare_data.py mode change 100644 => 100755 egs2/ms_indic_18/asr1/run.sh diff --git a/egs2/ms_indic_18/asr1/local/data.sh b/egs2/ms_indic_18/asr1/local/data.sh old mode 100644 new mode 100755 index a114bd73911..45f83f89d2a --- a/egs2/ms_indic_18/asr1/local/data.sh +++ b/egs2/ms_indic_18/asr1/local/data.sh @@ -13,7 +13,7 @@ stop_stage=100 SECONDS=0 lang=te # te ta gu - . utils/parse_options.sh || exit 1; +. utils/parse_options.sh || exit 1; log() { @@ -41,11 +41,12 @@ log "data preparation started" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then if [[ ! -d "${MS_INDIC_IS18}/${lang}-in-Train" ]]; then - log "stage0: Download data to ${MS_INDIC_IS18}. ${lang}-in-Train} directory is missing" + log "stage0: Download training data to ${MS_INDIC_IS18}. ${lang}-in-Train directory is missing" + exit 1 elif [[ ! -d "${MS_INDIC_IS18}/${lang}-in-Test" ]]; then - log "stage0: Download data to ${MS_INDIC_IS18}. ${lang}-in-Test} directory is missing" + log "stage0: Download test data to ${MS_INDIC_IS18}. ${lang}-in-Test directory is missing" + exit 1 fi - exit 1 fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then @@ -54,4 +55,5 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then local/prepare_data.py ${MS_INDIC_IS18} ${lang} fi + log "Successfully finished. [elapsed=${SECONDS}s]" diff --git a/egs2/ms_indic_18/asr1/local/prepare_data.py b/egs2/ms_indic_18/asr1/local/prepare_data.py old mode 100644 new mode 100755 index ffe2f43ba35..488cd920ff6 --- a/egs2/ms_indic_18/asr1/local/prepare_data.py +++ b/egs2/ms_indic_18/asr1/local/prepare_data.py @@ -6,9 +6,10 @@ import os -import soundfile as sf import random import sys +import librosa + if len(sys.argv) != 3: print("Usage: python prepare_data.py [data-directory] [language-ID]") @@ -21,7 +22,7 @@ testdir = f"{datadir}/{lang}-in-Test/" train_datadir = f"data/train_{lang}/" -valid_datadir = f"data/valid_{lang}/" +valid_datadir = f"data/dev_{lang}/" test_datadir = f"data/test_{lang}/" os.popen(f"mkdir -p {train_datadir}").read() @@ -30,26 +31,22 @@ # prepare data for training and validation splits -with open(traindir+'transcriptions.txt') as f: +with open(traindir+'transcription.txt') as f: train_lines = [line.rstrip() for line in f.readlines()] train_id2text = {} - train_id2dur = {} + train_id2filepath = {} for line in train_lines: wav_id = line.split()[0] filepath = f"{traindir}/Audios/{wav_id}.wav" train_id2text[wav_id] = ' '.join(line.split()[1:]) train_id2filepath[wav_id] = filepath -def get_duration(filepath): - x,f = sf.read(filepath) - return len(x)/f - wav_ids = list(train_id2text.keys()) random.shuffle(wav_ids) valid_id2text = {} valid_totaldur = 2*60*60 # (in seconds) 2 hours taken for validation split for wav_id in wav_ids: - dur = get_duration(train_id2filepath[wav_id]) + dur = librosa.get_duration(filename=train_id2filepath[wav_id]) valid_id2text[wav_id] = train_id2text.pop(wav_id) valid_totaldur -= dur if valid_totaldur < 0: @@ -58,7 +55,7 @@ def get_duration(filepath): with open(train_datadir+'text', 'w') as f: for wav_id in sorted(train_id2text): - f.write(f"{lang}_{wav_id} {test_id2text[wav_id]}\n") + f.write(f"{lang}_{wav_id} {train_id2text[wav_id]}\n") with open(train_datadir+'wav.scp', 'w') as f: for wav_id in sorted(train_id2text): f.write(f"{lang}_{wav_id} {train_id2filepath[wav_id]}\n") @@ -71,7 +68,7 @@ def get_duration(filepath): with open(valid_datadir+'text', 'w') as f: for wav_id in sorted(valid_id2text): - f.write(f"{lang}_{wav_id} {test_id2text[wav_id]}\n") + f.write(f"{lang}_{wav_id} {valid_id2text[wav_id]}\n") with open(valid_datadir+'wav.scp', 'w') as f: for wav_id in sorted(valid_id2text): f.write(f"{lang}_{wav_id} {train_id2filepath[wav_id]}\n") @@ -84,7 +81,7 @@ def get_duration(filepath): # prepare test data -with open(testdir+'transcriptions.txt') as f: +with open(testdir+'transcription.txt') as f: test_lines = [line.rstrip() for line in f.readlines()] test_id2text = {} test_id2filepath = {} diff --git a/egs2/ms_indic_18/asr1/run.sh b/egs2/ms_indic_18/asr1/run.sh old mode 100644 new mode 100755 index a48441d5070..e2a8c317a51 --- a/egs2/ms_indic_18/asr1/run.sh +++ b/egs2/ms_indic_18/asr1/run.sh @@ -15,11 +15,11 @@ asr_config=conf/train_asr.yaml lm_config=conf/train_lm.yaml inference_config=conf/decoder_asr.yaml -if [[ "zh" == *"${lang}"* ]]; then +if [[ "zh" == *"${lang}"* ]]; then # placeholder for optimal bpe when lang=te nbpe=2500 -elif [[ "fr" == *"${lang}"* ]]; then +elif [[ "fr" == *"${lang}"* ]]; then # placeholder for optimal bpe when lang=ta nbpe=350 -elif [[ "es" == *"${lang}"* ]]; then +elif [[ "es" == *"${lang}"* ]]; then # placeholder for optimal bpe when lang=gu nbpe=235 else nbpe=150 From 7dcbc5a5c52613d01b41b64303f8cb4772d93c9d Mon Sep 17 00:00:00 2001 From: Chaitanya Narisetty Date: Thu, 24 Mar 2022 11:22:37 -0400 Subject: [PATCH 2/3] Update data.sh removed unused variables --- egs2/ms_indic_18/asr1/local/data.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/egs2/ms_indic_18/asr1/local/data.sh b/egs2/ms_indic_18/asr1/local/data.sh index 45f83f89d2a..5d750bcd4ae 100755 --- a/egs2/ms_indic_18/asr1/local/data.sh +++ b/egs2/ms_indic_18/asr1/local/data.sh @@ -33,10 +33,6 @@ set -e set -u set -o pipefail -train_set=train_"$(echo "${lang}" | tr - _)" -train_dev=dev_"$(echo "${lang}" | tr - _)" -test_set=test_"$(echo "${lang}" | tr - _)" - log "data preparation started" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then From c4afcc3f0f1a15ff7cf83de8c1ac09d7c7e4d59e Mon Sep 17 00:00:00 2001 From: Chaitanya Narisetty Date: Thu, 24 Mar 2022 11:38:49 -0400 Subject: [PATCH 3/3] Update data.sh fixed bug with trying to create empty directory --- egs2/ms_indic_18/asr1/local/data.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/egs2/ms_indic_18/asr1/local/data.sh b/egs2/ms_indic_18/asr1/local/data.sh index 5d750bcd4ae..9a9c709a206 100755 --- a/egs2/ms_indic_18/asr1/local/data.sh +++ b/egs2/ms_indic_18/asr1/local/data.sh @@ -21,7 +21,6 @@ log() { echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" } -mkdir -p ${MS_INDIC_IS18} if [ -z "${MS_INDIC_IS18}" ]; then log "Fill the value of 'MS_INDIC_IS18' of db.sh" exit 1