Add the local data-preparation files for the egs2/mini_an4/st1 recipe.

* local/data.sh: staged script (options parsed by utils/parse_options.sh).
  Stage 1 untars downloads.tar.gz when downloads/ does not exist.
  Stage 2 checks for ${an4_root}/README, runs local/data_prep.py, sorts
  text / wav.scp / utt2spk for train and test, regenerates spk2utt, carves
  train_dev and train_nodev out of data/train with utils/subset_data_dir.sh,
  builds data/test_seg (copy of data/test plus a segments file) and copies
  wav.scp to spk1.scp. After the staged part, text is copied to
  text.lc.rm.en and text.tc.en for every set; that loop is not guarded by
  a stage check.
* local/data_prep.py, local/download_and_untar.sh: one-line files holding
  the relative path of the corresponding egs/an4/asr1/local file.

NOTE(review): in the copy this patch was recovered from, the wav.scp rewrite
in the "test_seg" step (data.sh line 68) had lost everything between '<' and
'>', leaving a bare "data/test_seg/wav.scp" that the shell would try to
execute. The awk command below is a reconstruction: it appends "_org" to the
first column of wav.scp so the IDs match the second column of the segments
file. Confirm it against the upstream recipe before applying.

diff --git a/egs2/mini_an4/st1/local/data.sh b/egs2/mini_an4/st1/local/data.sh
new file mode 100644
index 00000000000..030b7c30df5
--- /dev/null
+++ b/egs2/mini_an4/st1/local/data.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+log() {
+    local fname=${BASH_SOURCE[1]##*/}
+    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+SECONDS=0
+
+stage=1
+stop_stage=100
+
+an4_root=./downloads/an4
+
+log "$0 $*"
+. utils/parse_options.sh
+
+if [ $# -ne 0 ]; then
+    log "Error: No positional arguments are required."
+    exit 2
+fi
+
+. ./path.sh
+. ./cmd.sh
+
+train_set="train_nodev"
+train_dev="train_dev"
+
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    log "stage 1: Untar downloads.tar.gz"
+    if [ ! -e downloads/ ]; then
+        tar -xvf downloads.tar.gz
+    fi
+fi
+
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    log "stage 2: Data preparation"
+    mkdir -p data/{train,test}
+
+    if [ ! -f ${an4_root}/README ]; then
+        echo Cannot find an4 root! Exiting...
+        exit 1
+    fi
+
+    python3 local/data_prep.py ${an4_root} sph2pipe
+
+    for x in test train; do
+        for f in text wav.scp utt2spk; do
+            sort data/${x}/${f} -o data/${x}/${f}
+        done
+        utils/utt2spk_to_spk2utt.pl data/${x}/utt2spk > data/${x}/spk2utt
+    done
+
+    # make a dev set
+    utils/subset_data_dir.sh --first data/train 1 data/${train_dev}
+    n=$(($(wc -l < data/train/text) - 1))
+    utils/subset_data_dir.sh --last data/train ${n} data/${train_set}
+
+    # Create "test_seg" in order to test the use case of segments
+    rm -rf data/test_seg
+    utils/copy_data_dir.sh data/test data/test_seg
+    <data/test/wav.scp awk '{ for(i=2;i<=NF;i++){a=a " " $i}; print($1 "_org", a); a="" }' > data/test_seg/wav.scp
+    cat << EOF > data/test_seg/segments
+fcaw-cen8-b fcaw-cen8-b_org 0.0 2.9
+mmxg-cen8-b mmxg-cen8-b_org 0.0 2.3
+EOF
+
+    # for enh task
+    for x in test ${train_set} ${train_dev}; do
+        cp data/${x}/wav.scp data/${x}/spk1.scp
+    done
+fi
+
+for x in test test_seg ${train_set} ${train_dev}; do
+    cp data/${x}/text data/${x}/text.lc.rm.en
+    cp data/${x}/text data/${x}/text.tc.en
+done
+
+log "Successfully finished. [elapsed=${SECONDS}s]"
diff --git a/egs2/mini_an4/st1/local/data_prep.py b/egs2/mini_an4/st1/local/data_prep.py
new file mode 100644
index 00000000000..d416349ede8
--- /dev/null
+++ b/egs2/mini_an4/st1/local/data_prep.py
@@ -0,0 +1 @@
+../../../../egs/an4/asr1/local/data_prep.py
\ No newline at end of file
diff --git a/egs2/mini_an4/st1/local/download_and_untar.sh b/egs2/mini_an4/st1/local/download_and_untar.sh
new file mode 100644
index 00000000000..40bf437ab02
--- /dev/null
+++ b/egs2/mini_an4/st1/local/download_and_untar.sh
@@ -0,0 +1 @@
+../../../../egs/an4/asr1/local/download_and_untar.sh
\ No newline at end of file