forked from espnet/espnet
Commit
Merge branch 'espnet:master' into master
Showing 342 changed files with 14,770 additions and 614 deletions.
@@ -0,0 +1,48 @@
name: docker-builder

on:
  pull_request:
    types: [closed]
    branches:
      - master
    paths:
      - 'tools/**'
      - setup.py

jobs:
  docker:
    runs-on: ubuntu-latest
    if: ${{ github.event.pull_request.merged == 'true' }}
    steps:
      - uses: actions/checkout@v2

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      - name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Build and push CPU container
        run: |
          cd docker
          docker build --build-arg FROM_TAG=runtime-latest \
            -f prebuilt/devel.dockerfile \
            --target devel \
            -t espnet/espnet:cpu-latest .
          docker push espnet/espnet:cpu-latest

      - name: Build and push GPU container
        run: |
          cd docker
          docker build --build-arg FROM_TAG=cuda-latest \
            --build-arg CUDA_VER=11.1 \
            -f prebuilt/devel.dockerfile \
            --target devel \
            -t espnet/espnet:gpu-latest .
          docker push espnet/espnet:gpu-latest
Empty file.
File renamed without changes.
@@ -0,0 +1 @@
tuning/decode_pytorch_transformer.yaml
File renamed without changes.
File renamed without changes.
@@ -0,0 +1,8 @@
# rnnlm related
layer: 2
unit: 650
opt: sgd        # or adam
batchsize: 64   # batch size in LM training
epoch: 20       # if the data size is large, we can reduce this
patience: 3
maxlen: 100     # if sentence length > lm_maxlen, lm_batchsize is automatically reduced
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -0,0 +1,16 @@
process:
    # these three processes are a.k.a. SpecAugment
    - type: "time_warp"
      max_time_warp: 5
      inplace: true
      mode: "PIL"
    - type: "freq_mask"
      F: 30
      n_mask: 2
      inplace: true
      replace_with_zero: false
    - type: "time_mask"
      T: 40
      n_mask: 2
      inplace: true
      replace_with_zero: false
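The three processes above make up the usual SpecAugment recipe: a small time warp followed by frequency and time masking. As a rough illustration of what the masking parameters control, here is a minimal NumPy sketch of freq_mask (F: 30, n_mask: 2) and time_mask (T: 40, n_mask: 2). It is not ESPnet's transform code, and it fills masks with zeros for brevity, whereas replace_with_zero: false tells ESPnet to fill them with the feature mean instead.

import numpy as np

def freq_mask(spec, F=30, n_mask=2):
    # spec: (time, mel) log-mel features; zero out n_mask random bands of width < F
    spec = spec.copy()
    n_freq = spec.shape[1]
    for _ in range(n_mask):
        width = np.random.randint(0, F)
        start = np.random.randint(0, max(1, n_freq - width))
        spec[:, start:start + width] = 0.0
    return spec

def time_mask(spec, T=40, n_mask=2):
    # zero out n_mask random spans of up to T frames
    spec = spec.copy()
    n_time = spec.shape[0]
    for _ in range(n_mask):
        width = np.random.randint(0, T)
        start = np.random.randint(0, max(1, n_time - width))
        spec[start:start + width, :] = 0.0
    return spec

x = np.random.randn(300, 80)     # 300 frames x 80 mel bins
x = time_mask(freq_mask(x))      # time_warp (max_time_warp: 5) is omitted in this sketch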
@@ -0,0 +1 @@
tuning/train_pytorch_conformer_kernel15.yaml
egs/aesrc2020/asr1/conf/tuning/decode_pytorch_transformer.yaml
8 changes: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
batchsize: 0
beam-size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.5
lm-weight: 0.3
ngram-weight: 0.3
@@ -0,0 +1,6 @@
beam-size: 20
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc-weight: 0.6
lm-weight: 0.3
egs/aesrc2020/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml
47 changes: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512   # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150  # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0     # feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d  # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 15
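A note on the optimization block above: opt: noam is the inverse-square-root warmup schedule from the original Transformer paper, and (assuming ESPnet's noam follows the standard formula) transformer-lr acts as a scale factor on adim^-0.5 * min(step^-0.5, step * warmup^-1.5). A tiny sketch with this config's values (adim: 256, transformer-lr: 1.0, transformer-warmup-steps: 25000):

def noam_lr(step, model_size=256, factor=1.0, warmup=25000):
    # standard Noam schedule: linear warmup, then 1/sqrt(step) decay
    return factor * model_size ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

for step in (1000, 25000, 100000):
    print(step, noam_lr(step))   # peaks around 4e-4 at step 25000, then decays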
egs/aesrc2020/asr1/conf/tuning/train_pytorch_conformer_kernel31.yaml
47 changes: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512   # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150  # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0     # feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_conformer:E2E"
transformer-input-layer: conv2d  # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch

# conformer specific setting
transformer-encoder-pos-enc-layer-type: rel_pos
transformer-encoder-selfattn-layer-type: rel_selfattn
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31
egs/aesrc2020/asr1/conf/tuning/train_pytorch_transformer.yaml
40 changes: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# network architecture
# encoder related
elayers: 12
eunits: 2048
# decoder related
dlayers: 6
dunits: 2048
# attention related
adim: 256
aheads: 4

# hybrid CTC/attention
mtlalpha: 0.3

# label smoothing
lsm-weight: 0.1

# minibatch related
batch-size: 32
maxlen-in: 512   # if input length > maxlen-in, batchsize is automatically reduced
maxlen-out: 150  # if output length > maxlen-out, batchsize is automatically reduced

# optimization related
sortagrad: 0     # feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
opt: noam
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 50
dropout-rate: 0.1

# transformer specific setting
backend: pytorch
model-module: "espnet.nets.pytorch_backend.e2e_asr_transformer:E2E"
transformer-input-layer: conv2d  # encoder architecture type
transformer-lr: 2.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch
@@ -0,0 +1,31 @@
# network architecture
# encoder related
etype: vggblstm        # encoder architecture type
elayers: 3
eunits: 1024
eprojs: 1024
subsample: "1_2_2_1_1" # skip every n frame from input to nth layers
# decoder related
dlayers: 2
dunits: 1024
# attention related
atype: location
adim: 1024
aconv-chans: 10
aconv-filts: 100

# hybrid CTC/attention
mtlalpha: 0.5

# minibatch related
batch-size: 30
maxlen-in: 800   # if input length > maxlen_in, batchsize is automatically reduced
maxlen-out: 150  # if output length > maxlen_out, batchsize is automatically reduced

# optimization related
opt: adadelta
epochs: 10
patience: 0

# scheduled sampling option
sampling-probability: 0.0
@@ -0,0 +1,24 @@
#!/bin/bash

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

data=$1    # data transformed into kaldi format

# divide development set for cross validation
if [ -d ${data} ]; then
    for i in US UK IND CHN JPN PT RU KR CA ES; do
        ./utils/subset_data_dir.sh --spk-list local/files/cvlist/${i}_cv_spk $data/data_all $data/cv/$i
        cat $data/cv/$i/feats.scp >> $data/cv.scp
    done
    ./utils/filter_scp.pl --exclude $data/cv.scp $data/data_all/feats.scp > $data/train_and_dev.scp
    # 95-5 split for dev set
    sed -n '0~20p' $data/train_and_dev.scp > $data/dev.scp
    ./utils/filter_scp.pl --exclude $data/dev.scp $data/train_and_dev.scp > $data/train.scp
    ./utils/subset_data_dir.sh --utt-list $data/train.scp $data/data_all $data/train_org
    ./utils/subset_data_dir.sh --utt-list $data/dev.scp $data/data_all $data/dev_org
    ./utils/subset_data_dir.sh --utt-list $data/cv.scp $data/data_all $data/cv_all
fi

echo "local/subset_data.sh succeeded"
exit 0;
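The commented 95-5 split relies on GNU sed's first~step addressing: '0~20p' prints every 20th line (20, 40, 60, ...), so roughly 5% of train_and_dev.scp goes to dev.scp and filter_scp.pl keeps the remaining ~95% for training. A toy Python illustration of the same selection, using hypothetical utterance ids:

# emulate `sed -n '0~20p'`: keep lines whose 1-based index is a multiple of 20
utts = ["utt%04d" % i for i in range(1, 201)]     # a hypothetical 200-line scp
dev = utts[19::20]                                # utt0020, utt0040, ... (10 lines, 5%)
train = [u for u in utts if u not in set(dev)]    # the remaining 95%
print(len(dev), len(train))                       # -> 10 190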
@@ -0,0 +1,45 @@
#!/bin/bash

# Copyright 2020 Audio, Speech and Language Processing Group @ NWPU (Author: Xian Shi)
# Apache 2.0

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

raw_data=$1    # raw data with metadata, txt and wav
data=$2        # data transformed into kaldi format

# generate kaldi format data for all
if [ -d ${raw_data} ]; then
    echo "Generating kaldi format data."
    mkdir -p $data/data_all
    find $raw_data -type f -name "*.wav" > $data/data_all/wavpath
    awk -F'/' '{print $(NF-2)"-"$(NF-1)"-"$NF}' $data/data_all/wavpath | sed 's:\.wav::g' > $data/data_all/uttlist
    paste $data/data_all/uttlist $data/data_all/wavpath > $data/data_all/wav.scp
    python local/preprocess.py $data/data_all/wav.scp $data/data_all/trans $data/data_all/utt2spk  # faster than a shell loop
    ./utils/utt2spk_to_spk2utt.pl $data/data_all/utt2spk > $data/data_all/spk2utt
fi

# clean transcription
if [ -d $data/data_all ]; then
    echo "Cleaning transcription."
    tr '[a-z]' '[A-Z]' < $data/data_all/trans > $data/data_all/trans_upper
    # turn "." in specific abbreviations into a "<m>" tag
    sed -i -e 's: MR\.: MR<m>:g' -e 's: MRS\.: MRS<m>:g' -e 's: MS\.: MS<m>:g' \
        -e 's:^MR\.:MR<m>:g' -e 's:^MRS\.:MRS<m>:g' -e 's:^MS\.:MS<m>:g' $data/data_all/trans_upper
    # expand the "ST." abbreviation
    sed -i 's:^ST\.:STREET:g' $data/data_all/trans_upper
    sed -i 's: ST\.: STREET:g' $data/data_all/trans_upper
    # strip punctuation marks
    sed -i "s%,\|\.\|?\|!\|;\|-\|:\|,'\|\.'\|?'\|!'\| '% %g" $data/data_all/trans_upper
    sed -i 's:<m>:.:g' $data/data_all/trans_upper
    # squeeze repeated blanks
    sed -i 's:[ ][ ]*: :g' $data/data_all/trans_upper
    paste $data/data_all/uttlist $data/data_all/trans_upper > $data/data_all/text

    # critically, the tab between uttid and text must be replaced with a space
    sed -e "s/\t/ /g" -i $data/data_all/text
fi

echo "local/data_prep.sh succeeded"
exit 0;
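The awk/sed pipeline in the first block builds each utterance id from the last three path components with the .wav extension stripped, so a hypothetical path such as .../US/G00473/G00473S1037.wav becomes US-G00473-G00473S1037 (the real AESRC2020 directory layout may differ). The same transform expressed in Python:

def wav_path_to_uttid(path):
    # mimic: awk -F'/' '{print $(NF-2)"-"$(NF-1)"-"$NF}' | sed 's:\.wav::g'
    return "-".join(path.strip().split("/")[-3:]).replace(".wav", "")

# hypothetical example; the corpus is assumed to be laid out as accent/speaker/utterance.wav
print(wav_path_to_uttid("raw/US/G00473/G00473S1037.wav"))   # -> US-G00473-G00473S1037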
@@ -0,0 +1,23 @@
#!/usr/bin/env bash

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

zipped_data=$1
raw_data=$2/Datatang-English/data

# unzip and rename each accent
unzip $zipped_data -d ${2}
mv $raw_data/American\ English\ Speech\ Data $raw_data/US
mv $raw_data/British\ English\ Speech\ Data $raw_data/UK
mv $raw_data/Chinese\ Speaking\ English\ Speech\ Data $raw_data/CHN
mv $raw_data/Indian\ English\ Speech\ Data $raw_data/IND
mv $raw_data/Portuguese\ Speaking\ English\ Speech\ Data $raw_data/PT
mv $raw_data/Russian\ Speaking\ English\ Speech\ Data $raw_data/RU
mv $raw_data/Japanese\ Speaking\ English\ Speech\ Data $raw_data/JPN
mv $raw_data/Korean\ Speaking\ English\ Speech\ Data $raw_data/KR
mv $raw_data/Canadian\ English\ Speech\ Data $raw_data/CA
mv $raw_data/Spanish\ Speaking\ English\ Speech\ Data $raw_data/ES

echo "local/download_and_untar.sh succeeded"
exit 0;