From 8aa881bbad0e01f8128725c48a6a480bcb03da2b Mon Sep 17 00:00:00 2001 From: wentao Date: Mon, 21 Feb 2022 08:17:53 +0100 Subject: [PATCH 001/141] change repo name and egs/README.MD --- egs/README.md | 2 +- egs/{lrs => lrs2}/asr1/RESULTS.md | 0 egs/{lrs => lrs2}/asr1/cmd.sh | 0 egs/{lrs => lrs2}/asr1/conf/decode.yaml | 0 egs/{lrs => lrs2}/asr1/conf/fbank.conf | 0 egs/{lrs => lrs2}/asr1/conf/gpu.conf | 0 egs/{lrs => lrs2}/asr1/conf/lm.yaml | 0 egs/{lrs => lrs2}/asr1/conf/pitch.conf | 0 egs/{lrs => lrs2}/asr1/conf/queue.conf | 0 egs/{lrs => lrs2}/asr1/conf/slurm.conf | 0 egs/{lrs => lrs2}/asr1/conf/specaug.yaml | 0 egs/{lrs => lrs2}/asr1/conf/train.yaml | 0 egs/{lrs => lrs2}/asr1/local/README.md | 0 egs/{lrs => lrs2}/asr1/local/data_preparation.sh | 0 egs/{lrs => lrs2}/asr1/local/make_files.py | 0 egs/{lrs => lrs2}/asr1/local/pretrain.py | 0 egs/{lrs => lrs2}/asr1/path.sh | 0 egs/{lrs => lrs2}/asr1/run.sh | 0 egs/{lrs => lrs2}/asr1/steps | 0 egs/{lrs => lrs2}/asr1/utils | 0 20 files changed, 1 insertion(+), 1 deletion(-) rename egs/{lrs => lrs2}/asr1/RESULTS.md (100%) rename egs/{lrs => lrs2}/asr1/cmd.sh (100%) rename egs/{lrs => lrs2}/asr1/conf/decode.yaml (100%) rename egs/{lrs => lrs2}/asr1/conf/fbank.conf (100%) rename egs/{lrs => lrs2}/asr1/conf/gpu.conf (100%) rename egs/{lrs => lrs2}/asr1/conf/lm.yaml (100%) rename egs/{lrs => lrs2}/asr1/conf/pitch.conf (100%) rename egs/{lrs => lrs2}/asr1/conf/queue.conf (100%) rename egs/{lrs => lrs2}/asr1/conf/slurm.conf (100%) rename egs/{lrs => lrs2}/asr1/conf/specaug.yaml (100%) rename egs/{lrs => lrs2}/asr1/conf/train.yaml (100%) rename egs/{lrs => lrs2}/asr1/local/README.md (100%) rename egs/{lrs => lrs2}/asr1/local/data_preparation.sh (100%) rename egs/{lrs => lrs2}/asr1/local/make_files.py (100%) rename egs/{lrs => lrs2}/asr1/local/pretrain.py (100%) rename egs/{lrs => lrs2}/asr1/path.sh (100%) rename egs/{lrs => lrs2}/asr1/run.sh (100%) rename egs/{lrs => lrs2}/asr1/steps (100%) rename egs/{lrs => lrs2}/asr1/utils (100%) diff --git a/egs/README.md b/egs/README.md index 61951b84d47..178bfb03e7b 100755 --- a/egs/README.md +++ b/egs/README.md @@ -49,7 +49,7 @@ See: https://espnet.github.io/espnet/tutorial.html | librispeech | LibriSpeech ASR corpus | ASR | EN | http://www.openslr.org/12 | | | libritts | LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech | TTS | EN | http://www.openslr.org/60/ | | | ljspeech | The LJ Speech Dataset | TTS | EN | https://keithito.com/LJ-Speech-Dataset/ | | -| lrs | The Lip Reading Sentences Dataset | ASR/AVSR | EN | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | | +| lrs2 | The Lip Reading Sentences 2 Dataset | ASR | EN | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | | | m_ailabs | The M-AILABS Speech Dataset | TTS | ~5 languages | https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/ | | mucs_2021 | MUCS 2021: MUltilingual and Code-Switching ASR Challenges for Low Resource Indian Languages | ASR/Code Switching | HI, MR, OR, TA, TE, GU, HI-EN, BN-EN | https://navana-tech.github.io/MUCS2021/data.html | | | mtedx | Multilingual TEDx | ASR/Machine Translation/Speech Translation | 13 Language pairs | http://www.openslr.org/100/ | diff --git a/egs/lrs/asr1/RESULTS.md b/egs/lrs2/asr1/RESULTS.md similarity index 100% rename from egs/lrs/asr1/RESULTS.md rename to egs/lrs2/asr1/RESULTS.md diff --git a/egs/lrs/asr1/cmd.sh b/egs/lrs2/asr1/cmd.sh similarity index 100% rename from egs/lrs/asr1/cmd.sh rename to egs/lrs2/asr1/cmd.sh diff --git a/egs/lrs/asr1/conf/decode.yaml 
b/egs/lrs2/asr1/conf/decode.yaml similarity index 100% rename from egs/lrs/asr1/conf/decode.yaml rename to egs/lrs2/asr1/conf/decode.yaml diff --git a/egs/lrs/asr1/conf/fbank.conf b/egs/lrs2/asr1/conf/fbank.conf similarity index 100% rename from egs/lrs/asr1/conf/fbank.conf rename to egs/lrs2/asr1/conf/fbank.conf diff --git a/egs/lrs/asr1/conf/gpu.conf b/egs/lrs2/asr1/conf/gpu.conf similarity index 100% rename from egs/lrs/asr1/conf/gpu.conf rename to egs/lrs2/asr1/conf/gpu.conf diff --git a/egs/lrs/asr1/conf/lm.yaml b/egs/lrs2/asr1/conf/lm.yaml similarity index 100% rename from egs/lrs/asr1/conf/lm.yaml rename to egs/lrs2/asr1/conf/lm.yaml diff --git a/egs/lrs/asr1/conf/pitch.conf b/egs/lrs2/asr1/conf/pitch.conf similarity index 100% rename from egs/lrs/asr1/conf/pitch.conf rename to egs/lrs2/asr1/conf/pitch.conf diff --git a/egs/lrs/asr1/conf/queue.conf b/egs/lrs2/asr1/conf/queue.conf similarity index 100% rename from egs/lrs/asr1/conf/queue.conf rename to egs/lrs2/asr1/conf/queue.conf diff --git a/egs/lrs/asr1/conf/slurm.conf b/egs/lrs2/asr1/conf/slurm.conf similarity index 100% rename from egs/lrs/asr1/conf/slurm.conf rename to egs/lrs2/asr1/conf/slurm.conf diff --git a/egs/lrs/asr1/conf/specaug.yaml b/egs/lrs2/asr1/conf/specaug.yaml similarity index 100% rename from egs/lrs/asr1/conf/specaug.yaml rename to egs/lrs2/asr1/conf/specaug.yaml diff --git a/egs/lrs/asr1/conf/train.yaml b/egs/lrs2/asr1/conf/train.yaml similarity index 100% rename from egs/lrs/asr1/conf/train.yaml rename to egs/lrs2/asr1/conf/train.yaml diff --git a/egs/lrs/asr1/local/README.md b/egs/lrs2/asr1/local/README.md similarity index 100% rename from egs/lrs/asr1/local/README.md rename to egs/lrs2/asr1/local/README.md diff --git a/egs/lrs/asr1/local/data_preparation.sh b/egs/lrs2/asr1/local/data_preparation.sh similarity index 100% rename from egs/lrs/asr1/local/data_preparation.sh rename to egs/lrs2/asr1/local/data_preparation.sh diff --git a/egs/lrs/asr1/local/make_files.py b/egs/lrs2/asr1/local/make_files.py similarity index 100% rename from egs/lrs/asr1/local/make_files.py rename to egs/lrs2/asr1/local/make_files.py diff --git a/egs/lrs/asr1/local/pretrain.py b/egs/lrs2/asr1/local/pretrain.py similarity index 100% rename from egs/lrs/asr1/local/pretrain.py rename to egs/lrs2/asr1/local/pretrain.py diff --git a/egs/lrs/asr1/path.sh b/egs/lrs2/asr1/path.sh similarity index 100% rename from egs/lrs/asr1/path.sh rename to egs/lrs2/asr1/path.sh diff --git a/egs/lrs/asr1/run.sh b/egs/lrs2/asr1/run.sh similarity index 100% rename from egs/lrs/asr1/run.sh rename to egs/lrs2/asr1/run.sh diff --git a/egs/lrs/asr1/steps b/egs/lrs2/asr1/steps similarity index 100% rename from egs/lrs/asr1/steps rename to egs/lrs2/asr1/steps diff --git a/egs/lrs/asr1/utils b/egs/lrs2/asr1/utils similarity index 100% rename from egs/lrs/asr1/utils rename to egs/lrs2/asr1/utils From 56a6a7381f01172e67d0783dc2504b0e5e78f46c Mon Sep 17 00:00:00 2001 From: wentao Date: Mon, 21 Feb 2022 11:57:38 +0100 Subject: [PATCH 002/141] add lrs avsr --- egs/README.md | 1 + egs/lrs/README.md | 356 +++++ egs/lrs/avsr1/RESULTS.md | 274 ++++ egs/lrs/avsr1/cmd.sh | 89 ++ egs/lrs/avsr1/conf/decode.yaml | 7 + egs/lrs/avsr1/conf/fbank.conf | 2 + egs/lrs/avsr1/conf/gpu.conf | 10 + egs/lrs/avsr1/conf/lm.yaml | 9 + egs/lrs/avsr1/conf/mfcc.conf | 2 + egs/lrs/avsr1/conf/mfcc_hires.conf | 10 + egs/lrs/avsr1/conf/pitch.conf | 1 + egs/lrs/avsr1/conf/queue.conf | 10 + egs/lrs/avsr1/conf/slurm.conf | 12 + egs/lrs/avsr1/conf/specaug.yaml | 16 + egs/lrs/avsr1/conf/train.yaml | 39 
+ egs/lrs/avsr1/local/LRS3dataprocessing.sh | 113 ++ egs/lrs/avsr1/local/Openface.sh | 69 + egs/lrs/avsr1/local/Openface_vidaug.sh | 67 + egs/lrs/avsr1/local/audio_augmentation.sh | 107 ++ .../avsr1/local/audio_augmentation_recog.sh | 81 + egs/lrs/avsr1/local/audio_data_prep.sh | 94 ++ egs/lrs/avsr1/local/audioaugwav.sh | 29 + egs/lrs/avsr1/local/convertsnr.py | 50 + egs/lrs/avsr1/local/creatsegfile.py | 62 + egs/lrs/avsr1/local/dumpcreate/audiodump.py | 75 + .../local/dumpcreate/avpretraindecodedump.py | 188 +++ .../avsr1/local/dumpcreate/avpretraindump.py | 159 ++ .../local/dumpcreate/avtraindecodedump.py | 204 +++ egs/lrs/avsr1/local/dumpcreate/avtraindump.py | 131 ++ egs/lrs/avsr1/local/dumpcreate/videodump.py | 40 + egs/lrs/avsr1/local/extractfeatures.sh | 18 + egs/lrs/avsr1/local/extractframs.sh | 64 + egs/lrs/avsr1/local/extractsnr.py | 52 + egs/lrs/avsr1/local/extractsnr.sh | 46 + egs/lrs/avsr1/local/extractvfeatures.py | 413 ++++++ .../audio/kaldi_prep_nosegment.py | 70 + .../audio/kaldi_prep_segment.py | 91 ++ .../local/lrs3processing/audio/segmentinfo.py | 157 ++ egs/lrs/avsr1/local/make_video.py | 42 + egs/lrs/avsr1/local/prepaudio.py | 91 ++ egs/lrs/avsr1/local/preppretrainaudio.py | 239 +++ egs/lrs/avsr1/local/pretrain.py | 245 ++++ egs/lrs/avsr1/local/remakewav.py | 21 + egs/lrs/avsr1/local/se_batch.py | 65 + egs/lrs/avsr1/local/segaugaudio.py | 64 + egs/lrs/avsr1/local/segvideo.py | 457 ++++++ egs/lrs/avsr1/local/splitsnr.py | 96 ++ egs/lrs/avsr1/local/training/finetune_av.sh | 172 +++ .../avsr1/local/training/finetune_video.sh | 176 +++ .../avsr1/local/training/finetuneav/asr.py | 1293 ++++++++++++++++ .../local/training/finetuneav/asr_init.py | 250 ++++ .../training/finetuneav/asr_recog_avrms.py | 304 ++++ .../training/finetuneav/asr_train_avrms.py | 654 +++++++++ .../local/training/finetuneav/asr_utils.py | 896 ++++++++++++ .../local/training/finetuneav/attention.py | 76 + .../local/training/finetuneav/batchfy.py | 494 +++++++ .../avsr1/local/training/finetuneav/ctc.py | 182 +++ .../training/finetuneav/ctcattweights.py | 59 + .../local/training/finetuneav/ctcencoder.py | 104 ++ .../avsr1/local/training/finetuneav/dda.py | 47 + .../local/training/finetuneav/decoder.py | 151 ++ .../training/finetuneav/decoder_layer.py | 90 ++ .../finetuneav/e2e_asr_transformer.py | 701 +++++++++ .../local/training/finetuneav/io_utils.py | 695 +++++++++ .../finetuneav/label_smoothing_loss.py | 53 + .../training/finetuneav/lipreadingmodel.py | 228 +++ .../local/training/finetuneav/nets_utils.py | 370 +++++ .../avsr1/local/training/finetuneav/plot.py | 135 ++ .../local/training/finetuneav/rmencoder.py | 103 ++ .../local/training/finetuneav/videoencoder.py | 146 ++ .../training/finetuneav/weighttransfn.py | 78 + .../avsr1/local/training/finetunevideo/asr.py | 1235 ++++++++++++++++ .../training/finetunevideo/asr_recog_video.py | 308 ++++ .../training/finetunevideo/asr_train_video.py | 655 +++++++++ .../local/training/finetunevideo/asr_utils.py | 805 ++++++++++ .../local/training/finetunevideo/batchfy.py | 918 ++++++++++++ .../training/finetunevideo/ctcencoder.py | 111 ++ .../local/training/finetunevideo/decoder.py | 144 ++ .../finetunevideo/e2e_asr_transformer.py | 543 +++++++ .../local/training/finetunevideo/encoder.py | 144 ++ .../local/training/finetunevideo/io_utils.py | 591 ++++++++ .../training/finetunevideo/lipreadingmodel.py | 228 +++ .../local/training/finetunevideo/plot.py | 135 ++ egs/lrs/avsr1/local/training/pretrain_av.sh | 168 +++ .../avsr1/local/training/pretrain_video.sh | 167 
+++ .../avsr1/local/training/pretrainav/asr.py | 1298 +++++++++++++++++ .../local/training/pretrainav/asr_init.py | 250 ++++ .../pretrainav/asr_recog_pretrain_av.py | 304 ++++ .../pretrainav/asr_train_pretrain_av.py | 654 +++++++++ .../local/training/pretrainav/asr_utils.py | 862 +++++++++++ .../local/training/pretrainav/attention.py | 76 + .../local/training/pretrainav/batchfy.py | 494 +++++++ .../avsr1/local/training/pretrainav/ctc.py | 184 +++ .../local/training/pretrainav/ctcencoder.py | 111 ++ .../local/training/pretrainav/decoder.py | 152 ++ .../training/pretrainav/decoder_layer.py | 90 ++ .../pretrainav/e2e_asr_transformer.py | 694 +++++++++ .../local/training/pretrainav/io_utils.py | 696 +++++++++ .../pretrainav/label_smoothing_loss.py | 53 + .../local/training/pretrainav/nets_utils.py | 370 +++++ .../avsr1/local/training/pretrainav/plot.py | 135 ++ .../local/training/pretrainav/rmencoder.py | 103 ++ .../local/training/pretrainav/shattention.py | 60 + .../local/training/pretrainav/weightctcfn.py | 46 + .../training/pretrainav/weightsampling.py | 30 + .../training/pretrainav/weighttransfn.py | 78 + .../avsr1/local/training/pretrainvideo/asr.py | 1234 ++++++++++++++++ .../pretrainvideo/asr_recog_videopretrain.py | 304 ++++ .../pretrainvideo/asr_train_videopretrain.py | 652 +++++++++ .../local/training/pretrainvideo/asr_utils.py | 805 ++++++++++ .../local/training/pretrainvideo/batchfy.py | 918 ++++++++++++ .../training/pretrainvideo/ctcencoder.py | 111 ++ .../local/training/pretrainvideo/decoder.py | 144 ++ .../pretrainvideo/e2e_asr_transformer.py | 542 +++++++ .../local/training/pretrainvideo/io_utils.py | 595 ++++++++ .../local/training/pretrainvideo/plot.py | 135 ++ egs/lrs/avsr1/local/training/train_audio.sh | 167 +++ .../avsr1/local/training/trainaudio/asr.py | 1230 ++++++++++++++++ .../training/trainaudio/asr_recog_audio.py | 304 ++++ .../training/trainaudio/asr_train_audio.py | 652 +++++++++ .../local/training/trainaudio/asr_utils.py | 801 ++++++++++ .../local/training/trainaudio/batchfy.py | 918 ++++++++++++ .../local/training/trainaudio/ctcencoder.py | 111 ++ .../local/training/trainaudio/decoder.py | 144 ++ .../trainaudio/e2e_asr_transformer.py | 541 +++++++ .../local/training/trainaudio/io_utils.py | 595 ++++++++ .../avsr1/local/training/trainaudio/plot.py | 135 ++ egs/lrs/avsr1/local/videoaug.py | 68 + egs/lrs/avsr1/path.sh | 18 + egs/lrs/avsr1/run.sh | 1167 +++++++++++++++ egs/lrs/avsr1/steps | 1 + egs/lrs/avsr1/utils | 1 + tools/installers/install_deepxi.sh | 34 + tools/installers/install_openface.sh | 124 ++ tools/installers/install_vidaug.sh | 26 + 135 files changed, 37364 insertions(+) create mode 100644 egs/lrs/README.md create mode 100755 egs/lrs/avsr1/RESULTS.md create mode 100755 egs/lrs/avsr1/cmd.sh create mode 100755 egs/lrs/avsr1/conf/decode.yaml create mode 100755 egs/lrs/avsr1/conf/fbank.conf create mode 100755 egs/lrs/avsr1/conf/gpu.conf create mode 100755 egs/lrs/avsr1/conf/lm.yaml create mode 100755 egs/lrs/avsr1/conf/mfcc.conf create mode 100755 egs/lrs/avsr1/conf/mfcc_hires.conf create mode 100755 egs/lrs/avsr1/conf/pitch.conf create mode 100755 egs/lrs/avsr1/conf/queue.conf create mode 100755 egs/lrs/avsr1/conf/slurm.conf create mode 100755 egs/lrs/avsr1/conf/specaug.yaml create mode 100755 egs/lrs/avsr1/conf/train.yaml create mode 100755 egs/lrs/avsr1/local/LRS3dataprocessing.sh create mode 100755 egs/lrs/avsr1/local/Openface.sh create mode 100755 egs/lrs/avsr1/local/Openface_vidaug.sh create mode 100755 egs/lrs/avsr1/local/audio_augmentation.sh 
create mode 100755 egs/lrs/avsr1/local/audio_augmentation_recog.sh create mode 100755 egs/lrs/avsr1/local/audio_data_prep.sh create mode 100755 egs/lrs/avsr1/local/audioaugwav.sh create mode 100755 egs/lrs/avsr1/local/convertsnr.py create mode 100755 egs/lrs/avsr1/local/creatsegfile.py create mode 100755 egs/lrs/avsr1/local/dumpcreate/audiodump.py create mode 100755 egs/lrs/avsr1/local/dumpcreate/avpretraindecodedump.py create mode 100755 egs/lrs/avsr1/local/dumpcreate/avpretraindump.py create mode 100755 egs/lrs/avsr1/local/dumpcreate/avtraindecodedump.py create mode 100755 egs/lrs/avsr1/local/dumpcreate/avtraindump.py create mode 100755 egs/lrs/avsr1/local/dumpcreate/videodump.py create mode 100755 egs/lrs/avsr1/local/extractfeatures.sh create mode 100755 egs/lrs/avsr1/local/extractframs.sh create mode 100755 egs/lrs/avsr1/local/extractsnr.py create mode 100755 egs/lrs/avsr1/local/extractsnr.sh create mode 100755 egs/lrs/avsr1/local/extractvfeatures.py create mode 100755 egs/lrs/avsr1/local/lrs3processing/audio/kaldi_prep_nosegment.py create mode 100755 egs/lrs/avsr1/local/lrs3processing/audio/kaldi_prep_segment.py create mode 100755 egs/lrs/avsr1/local/lrs3processing/audio/segmentinfo.py create mode 100755 egs/lrs/avsr1/local/make_video.py create mode 100755 egs/lrs/avsr1/local/prepaudio.py create mode 100755 egs/lrs/avsr1/local/preppretrainaudio.py create mode 100755 egs/lrs/avsr1/local/pretrain.py create mode 100755 egs/lrs/avsr1/local/remakewav.py create mode 100755 egs/lrs/avsr1/local/se_batch.py create mode 100755 egs/lrs/avsr1/local/segaugaudio.py create mode 100755 egs/lrs/avsr1/local/segvideo.py create mode 100755 egs/lrs/avsr1/local/splitsnr.py create mode 100755 egs/lrs/avsr1/local/training/finetune_av.sh create mode 100755 egs/lrs/avsr1/local/training/finetune_video.sh create mode 100755 egs/lrs/avsr1/local/training/finetuneav/asr.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/asr_init.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/asr_recog_avrms.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/asr_train_avrms.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/asr_utils.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/attention.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/batchfy.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/ctc.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/ctcattweights.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/ctcencoder.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/dda.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/decoder.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/decoder_layer.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/e2e_asr_transformer.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/io_utils.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/label_smoothing_loss.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/lipreadingmodel.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/nets_utils.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/plot.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/rmencoder.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/videoencoder.py create mode 100755 egs/lrs/avsr1/local/training/finetuneav/weighttransfn.py create mode 100755 egs/lrs/avsr1/local/training/finetunevideo/asr.py create mode 100755 
egs/lrs/avsr1/local/training/finetunevideo/asr_recog_video.py create mode 100755 egs/lrs/avsr1/local/training/finetunevideo/asr_train_video.py create mode 100755 egs/lrs/avsr1/local/training/finetunevideo/asr_utils.py create mode 100755 egs/lrs/avsr1/local/training/finetunevideo/batchfy.py create mode 100755 egs/lrs/avsr1/local/training/finetunevideo/ctcencoder.py create mode 100755 egs/lrs/avsr1/local/training/finetunevideo/decoder.py create mode 100755 egs/lrs/avsr1/local/training/finetunevideo/e2e_asr_transformer.py create mode 100755 egs/lrs/avsr1/local/training/finetunevideo/encoder.py create mode 100755 egs/lrs/avsr1/local/training/finetunevideo/io_utils.py create mode 100755 egs/lrs/avsr1/local/training/finetunevideo/lipreadingmodel.py create mode 100755 egs/lrs/avsr1/local/training/finetunevideo/plot.py create mode 100755 egs/lrs/avsr1/local/training/pretrain_av.sh create mode 100755 egs/lrs/avsr1/local/training/pretrain_video.sh create mode 100755 egs/lrs/avsr1/local/training/pretrainav/asr.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/asr_init.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/asr_recog_pretrain_av.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/asr_train_pretrain_av.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/asr_utils.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/attention.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/batchfy.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/ctc.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/ctcencoder.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/decoder.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/decoder_layer.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/e2e_asr_transformer.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/io_utils.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/label_smoothing_loss.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/nets_utils.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/plot.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/rmencoder.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/shattention.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/weightctcfn.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/weightsampling.py create mode 100755 egs/lrs/avsr1/local/training/pretrainav/weighttransfn.py create mode 100755 egs/lrs/avsr1/local/training/pretrainvideo/asr.py create mode 100755 egs/lrs/avsr1/local/training/pretrainvideo/asr_recog_videopretrain.py create mode 100755 egs/lrs/avsr1/local/training/pretrainvideo/asr_train_videopretrain.py create mode 100755 egs/lrs/avsr1/local/training/pretrainvideo/asr_utils.py create mode 100755 egs/lrs/avsr1/local/training/pretrainvideo/batchfy.py create mode 100755 egs/lrs/avsr1/local/training/pretrainvideo/ctcencoder.py create mode 100755 egs/lrs/avsr1/local/training/pretrainvideo/decoder.py create mode 100755 egs/lrs/avsr1/local/training/pretrainvideo/e2e_asr_transformer.py create mode 100755 egs/lrs/avsr1/local/training/pretrainvideo/io_utils.py create mode 100755 egs/lrs/avsr1/local/training/pretrainvideo/plot.py create mode 100755 egs/lrs/avsr1/local/training/train_audio.sh create mode 100755 egs/lrs/avsr1/local/training/trainaudio/asr.py create mode 100755 egs/lrs/avsr1/local/training/trainaudio/asr_recog_audio.py create mode 100755 
egs/lrs/avsr1/local/training/trainaudio/asr_train_audio.py
 create mode 100755 egs/lrs/avsr1/local/training/trainaudio/asr_utils.py
 create mode 100755 egs/lrs/avsr1/local/training/trainaudio/batchfy.py
 create mode 100755 egs/lrs/avsr1/local/training/trainaudio/ctcencoder.py
 create mode 100755 egs/lrs/avsr1/local/training/trainaudio/decoder.py
 create mode 100755 egs/lrs/avsr1/local/training/trainaudio/e2e_asr_transformer.py
 create mode 100755 egs/lrs/avsr1/local/training/trainaudio/io_utils.py
 create mode 100755 egs/lrs/avsr1/local/training/trainaudio/plot.py
 create mode 100755 egs/lrs/avsr1/local/videoaug.py
 create mode 100755 egs/lrs/avsr1/path.sh
 create mode 100755 egs/lrs/avsr1/run.sh
 create mode 120000 egs/lrs/avsr1/steps
 create mode 120000 egs/lrs/avsr1/utils
 create mode 100755 tools/installers/install_deepxi.sh
 create mode 100755 tools/installers/install_openface.sh
 create mode 100755 tools/installers/install_vidaug.sh

diff --git a/egs/README.md b/egs/README.md
index 178bfb03e7b..e46c4f56cd4 100755
--- a/egs/README.md
+++ b/egs/README.md
@@ -50,6 +50,7 @@ See: https://espnet.github.io/espnet/tutorial.html
 | libritts | LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech | TTS | EN | http://www.openslr.org/60/ | |
 | ljspeech | The LJ Speech Dataset | TTS | EN | https://keithito.com/LJ-Speech-Dataset/ | |
 | lrs2 | The Lip Reading Sentences 2 Dataset | ASR | EN | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | |
+| lrs | The Lip Reading Sentences 2 and 3 Dataset | AVSR | EN | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html | |
 | m_ailabs | The M-AILABS Speech Dataset | TTS | ~5 languages | https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/ |
 | mucs_2021 | MUCS 2021: MUltilingual and Code-Switching ASR Challenges for Low Resource Indian Languages | ASR/Code Switching | HI, MR, OR, TA, TE, GU, HI-EN, BN-EN | https://navana-tech.github.io/MUCS2021/data.html | |
 | mtedx | Multilingual TEDx | ASR/Machine Translation/Speech Translation | 13 Language pairs | http://www.openslr.org/100/ |
diff --git a/egs/lrs/README.md b/egs/lrs/README.md
new file mode 100644
index 00000000000..a802ba59542
--- /dev/null
+++ b/egs/lrs/README.md
@@ -0,0 +1,356 @@
+# ESPnet-AVSR
+
+## Introduction
+This repository contains an implementation of end-to-end (E2E) audio-visual speech recognition (AVSR) based on the ESPnet ASR toolkit. The new fusion strategy follows the paper "Fusing information streams in end-to-end audio-visual speech recognition" (https://ieeexplore.ieee.org/document/9414553) [[1]](#literature). A broad range of reliability measures is used to help the integration model improve the performance of the AVSR model. We use two large-vocabulary datasets, the Lip Reading Sentences 2 and 3 corpora, for all our experiments.
+In addition, this project also contains an audio-only model for comparison.
+
+## Table of Contents
+- [Installation](#installation-of-required-packages)
+  * [Requirements](#requirements)
+- [Project Structure](#project-structure)
+  * [Basics](#project-structure)
+  * [ASR1](#detailed-description-of-asr1)
+  * [AVSR1](#detailed-description-of-avsr1)
+- [Usage of the scripts](#running-the-script)
+  + [Path variables](#setting-path-variables)
+  + [Notes](#notes)
+
+
+## Installation of required packages
+
+### Requirements
+
+For installation, approximately 40 GB of free disk space is needed.
+The required packages are listed below:
+
+**Optional, if a CUDA-capable graphics card is available:**
+1. CUDA (version 10.0): https://developer.nvidia.com/cuda-toolkit
+2. cuDNN (>= 7.6): https://developer.nvidia.com/cudnn
+
+**Required Packages:**
+1. ESPnet: https://github.com/espnet/espnet
+2. OpenFace: https://github.com/TadasBaltrusaitis/OpenFace
+3. DeepXi: https://github.com/anicolson/DeepXi
+4. Vidaug: https://github.com/okankop/vidaug
+
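+This patch also adds helper scripts for these dependencies under tools/installers/. A minimal sketch of invoking them from the ESPnet tools directory (the exact arguments and install locations are assumptions; check each script's usage before running):
+```console
+foo@bar:~/espnet/tools$ bash installers/install_openface.sh   # presumably builds OpenFace under tools/
+foo@bar:~/espnet/tools$ bash installers/install_deepxi.sh     # presumably sets up DeepXi and its virtualenv
+foo@bar:~/espnet/tools$ bash installers/install_vidaug.sh     # presumably installs vidaug
+```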
+
+**After the installations are completed, please set the path variables for OpenFace, DeepXi, and Vidaug in avsr1/path.sh!**
+
+
+## Project structure
+The project is divided into two main folders. The first one, asr1/, contains an audio-only speech recognition model trained on the LRS2 dataset (https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html) [[2]](#literature). The other main folder, avsr1/, contains the code for the audio-visual speech recognition system, also trained on the LRS2 dataset [[2]](#literature) together with the LRS3 dataset (https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html) [[3]](#literature). Both systems follow the basic ESPnet structure.
+The main code for each recognition system is the run.sh script. In those scripts, the workflow of the systems is performed in multiple stages:
+
+| ASR                                               | AVSR                                                         |
+|---------------------------------------------------|--------------------------------------------------------------|
+| Stage -1: Data Download                           | Stage -1: Data Download and preparation                      |
+| Stage 0: Data Preparation in Kaldi-style          | Stage 0: Audio augmentation                                  |
+| Stage 1: Feature Generation                       | Stage 1: MP3 files and Feature Generation                    |
+| Stage 2: Dictionary and JSON data preparation     | Stage 2: Dictionary and JSON data preparation                |
+| Stage 3: Language model training                  | Stage 3: Reliability measures generation                     |
+| Stage 4: Training of the E2E-ASR model            | Stage 4: Language model training                             |
+| Stage 5: Decoding                                 | Stage 5: Training of the E2E-AVSR model and Decoding         |
+
+The folder structure for both systems is basically:
+* conf/: contains configuration files for the training, decoding, and feature extraction
+* data/: directory for storing data
+* exp/: log files, model parameters, training results
+* fbank/: speech feature binary files, e.g., ark, scp
+* dump*/: ESPnet metadata for training, e.g., json, hdf5
+* local/: contains local runtime scripts for data processing, data augmentation, and custom functions (e.g., face recognition in the AVSR system) that are not part of the standard ESPnet processing scripts
+* steps/: helper scripts from ESPnet (Kaldi)
+* utils/: helper scripts from ESPnet (Kaldi)
+
+### Detailed description of ASR1:
+##### Stage -1: Data Download
+ * Strictly speaking, this is not a separate stage, since the dataset must be downloaded in advance by yourself. For downloading the dataset, please visit https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html [[2]](#literature)
+ * You will need to sign a data sharing agreement with BBC Research & Development before getting access
+ * After downloading, please edit the path.sh file and assign the dataset directory path to the DATA_DIR variable
+
+##### Stage 0: Data Preparation in Kaldi-Style
+ * For every dataset part (pretrain, train, test, validate), prepare the data in Kaldi style
+ * More information about Kaldi-style data preparation: https://kaldi-asr.org/doc/data_prep.html
+ * Segmentation: if the variable segment is true, the data in the pretrain set is segmented into files with a length of 5 s to restrict the length of the data
+ * Generates the text, utt2spk and wav.scp files (see the sketch below)
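+As a rough illustration of these Kaldi-style files (the utterance and speaker IDs below are made up, not taken from LRS2), each file maps an utterance ID to its transcript, its speaker, and its audio, respectively:
+```console
+foo@bar:~/asr1$ head -n 1 data/train/text
+6330311066473698535_00011 ANYWAY IT IS NOT JUST ABOUT THAT
+foo@bar:~/asr1$ head -n 1 data/train/utt2spk
+6330311066473698535_00011 6330311066473698535
+foo@bar:~/asr1$ head -n 1 data/train/wav.scp
+6330311066473698535_00011 /home/foo/LRS2_wav/main/6330311066473698535/00011.wav
+```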
+
+##### Stage 1: Feature Generation
+ * Generate the filter bank features; by default, 80-dimensional filter banks with pitch on each frame
+ * Cepstral mean and variance normalization
+
+##### Stage 2: Dictionary and JSON data preparation
+ * Prepare a dictionary and save the data prepared in the previous steps as .json files
+ * If a pretrained language model is used, the dictionary data is replaced
+
+##### Stage 3: Language Model Training
+ * Train your own language model on the LibriSpeech dataset (https://www.openslr.org/11/) or use a pretrained language model
+ * It is possible to skip the language model and use the system without an external language model. For this, just remove the rnnlm from the decoding stage (5)
+
+##### Stage 4: Training
+ * Training of the E2E ASR system using the pretrain and train sets
+
+##### Stage 5: Decoding
+ * Decoding of the test and validation sets
+
+### Detailed description of AVSR1:
+
+##### Stage -1: Data preparation
+ * The LRS2 dataset [2] must be downloaded in advance by yourself. For downloading the dataset, please visit https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html [2]. You will need to sign a data sharing agreement with BBC Research & Development before getting access. After downloading, please edit the path.sh file and assign the dataset directory path to the DATA_DIR variable
+ * The same applies to the LRS3 dataset (https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html) [3]. After downloading, please edit the path.sh file and assign the dataset directory path to the DATALRS3_DIR variable
+ * Download the MUSAN dataset for audio data augmentation and save it under the ${MUSAN_DIR} directory
+ * Download the Room Impulse Response and Noise Database (RIRS-Noises) and save it under the RIRS_NOISES/ directory
+ * Run the audio_data_prep.sh script: create file lists for the given part of the dataset and prepare the Kaldi files
+ * Dump useful data for training
+
+##### Stage 0: Audio Augmentation
+ * Augment the audio data with RIRS noise
+ * Augment the audio data with MUSAN noise
+ * The augmented files are saved under data/audio/augment, whereas the clean audio files can be found in data/audio/clear, for all the used datasets (Test, Validation (Val), Train and, optionally, Pretrain)
+
+##### Stage 1: Feature Generation
+ * Make augmented MP3 files
+ * Generate the fbank and MFCC features for the audio signals. By default, 80-dimensional filter banks with pitch on each frame are used
+ * Compute global cepstral mean and variance normalization (CMVN) statistics for the extracted features (https://kaldi-asr.org/doc/compute-cmvn-stats_8cc.html); a command-line sketch follows below
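+A minimal sketch of this stage using the standard Kaldi/ESPnet tools (the data directory, job count, and output locations are illustrative, not the recipe's exact calls):
+```console
+foo@bar:~/avsr1$ steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 data/train exp/make_fbank/train fbank
+foo@bar:~/avsr1$ compute-cmvn-stats scp:data/train/feats.scp data/train/cmvn.ark
+```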
+
+##### Stage 2: Dictionary and JSON data preparation
+ * Build the dictionary and prepare the JSON data
+ * Build a tokenizer using SentencePiece: https://github.com/google/sentencepiece
+
+##### Stage 3: Reliability measures generation
+ * Stage 3.0: Create dump files for the MFCC features
+ * Stage 3.1: Video augmentation with Gaussian blur and salt-and-pepper noise
+ * Stage 3.2: OpenFace face recognition (especially of the mouth region; for further details, see the documentation in the avsr1/local folder)
+ * Stage 3.3: Extract video frames
+ * Stage 3.4: Estimate SNRs using the DeepXi framework
+ * Stage 3.5: Extract video features with the pretrained video feature extractor [[4]](#literature)
+ * Stage 3.6: Make video .ark files
+ * Stage 3.7: Remake the audio and video dump files
+ * Stage 3.8: Split the test decode dump files by signal-to-noise ratio
+
+##### Stage 4: Language Model Training
+ * Train your own language model on the LibriSpeech dataset (https://www.openslr.org/11/) or use a pretrained language model
+ * It is possible to skip the language model and use the system without an external language model. For this, just remove the rnnlm from the decoding stage (5)
+
+##### Stage 5: Network Training
+ * Train the audio model
+ * Pretrain the video model
+ * Finetune the video model
+ * Pretrain the AV model
+ * Finetune the AV model (the model used for decoding)
+
+##### Stage 6: Decoding
+
+##### Other important references:
+ * Explanation of the OpenFace CSV output: https://github.com/TadasBaltrusaitis/OpenFace/wiki/Output-Format#featureextraction
+
+
+## Running the script
+The main runtime script, **run.sh**, can be found in the avsr1/ directory.
+> Before running the script, please download the LRS2 (https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html) [[2]](#literature) and LRS3 (https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs3.html) [[3]](#literature) datasets by yourself and save the download paths to the variables DATA_DIR (LRS2 path) and DATALRS3_DIR (LRS3 path) inside the path.sh file.
+You will need to sign a data sharing agreement with BBC Research & Development before getting access.
+
+### Setting Path Variables
+The following path variables need to be set in the path.sh file before running the script (not all are necessary for the audio-only system); a sketch of these assignments follows after the list:
+ * **MAIN_ROOT:** the directory where ESPnet is installed (e.g., /home/foo/AVSR/ESPnet)
+ * **ESPNET_VENV:** if the name of the virtual environment for ESPnet is not venv, change this variable to the name of the environment
+ * **DATA_DIR:** the LRS2 dataset directory (e.g., /home/foo/LRS2)
+ * **DATALRS3_DIR:** the LRS3 dataset directory (e.g., /home/foo/LRS3), used for pretraining
+ * **OPENFACE_DIR:** the OpenFace build directory (e.g., /home/foo/AVSR/OpenFace/build/bin)
+ * **VIDAUG_DIR:** path to the vidaug directory if it is not installed in the ESPnet virtual environment
+ * **DEEPXI_DIR:** the DeepXi directory (e.g., /home/foo/AVSR/DeepXi)
+ * **DEEPXI_VENVDIR:** the DeepXi virtual environment directory (e.g., /home/foo/AVSR/DeepXi/bin/activate)
+ * **MUSAN_DIR:** the noise dataset directory (e.g., musan)
+ * **PRETRAINEDMODEL:** path to the pretrained video model
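+A hedged sketch of how these assignments might look inside avsr1/path.sh (all values are placeholders, not defaults):
+```bash
+# excerpt from avsr1/path.sh -- illustrative values only
+export MAIN_ROOT=/home/foo/AVSR/ESPnet
+export DATA_DIR=/home/foo/LRS2              # LRS2 download directory
+export DATALRS3_DIR=/home/foo/LRS3          # LRS3 download directory, used for pretraining
+export OPENFACE_DIR=/home/foo/AVSR/OpenFace/build/bin
+export DEEPXI_DIR=/home/foo/AVSR/DeepXi
+export DEEPXI_VENVDIR=/home/foo/AVSR/DeepXi/bin/activate
+export MUSAN_DIR=/home/foo/musan
+export PRETRAINEDMODEL=/home/foo/AVSR/pretrained/video_model.pt
+```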
+
+### Notes
+Due to the long runtime, it can be useful to run the script with the screen command, monitor it in a terminal window, and redirect the output to a log file.
+
+Screen is a terminal multiplexer, which means that you can start any number of virtual terminals inside the current terminal session. The advantage is that you can detach virtual terminals so that they run in the background. The processes keep running even if you close the main session or an SSH connection when working remotely on a server.
+Screen can be installed from the official package repositories via
+```console
+foo@bar:~$ sudo apt install screen
+```
+As an example, to redirect the output into a file named "log_run_sh.txt", the script could be started with:
+```console
+foo@bar:~/avsr1$ screen bash -c 'bash run.sh |& tee -a log_run_sh.txt'
+```
+This starts a virtual terminal session that executes and monitors the run.sh script. The output is printed to this session as well as saved into the file "log_run_sh.txt". You can leave the monitoring session by pressing Ctrl+A, then D. If you want to return to the process, type
+```console
+foo@bar:~$ screen -ls
+```
+into a terminal to see all running screen processes with their corresponding IDs. Then execute
+```console
+foo@bar:~$ screen -r [ID]
+```
+to return to the process.
+Source: https://wiki.ubuntuusers.de/Screen/
+
+***
+### Literature
+
+[1] W. Yu, S. Zeiler and D. Kolossa, "Fusing Information Streams in End-to-End Audio-Visual Speech Recognition," ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2021, pp. 3430-3434, doi: 10.1109/ICASSP39728.2021.9414553.
+
+[2] T. Afouras, J. S. Chung, A. Senior, O. Vinyals and A. Zisserman, "Deep Audio-Visual Speech Recognition," arXiv:1809.02108.
+
+[3] T. Afouras, J. S. Chung and A. Zisserman, "LRS3-TED: a large-scale dataset for visual speech recognition," arXiv preprint arXiv:1809.00496.
+
+[4] S. Petridis, T. Stafylakis, P. Ma, G. Tzimiropoulos and M. Pantic, "Audio-visual speech recognition with a hybrid CTC/Attention architecture," in IEEE SLT. IEEE, 2018.
+
diff --git a/egs/lrs/avsr1/RESULTS.md b/egs/lrs/avsr1/RESULTS.md
new file mode 100755
index 00000000000..654e029a61c
--- /dev/null
+++ b/egs/lrs/avsr1/RESULTS.md
@@ -0,0 +1,274 @@
+## pretrain_Train_pytorch_audio_delta_specaug (Audio-Only)
+
+* Model files (archived to model.tar.gz by $ pack_model.sh)
+  - download link: https://drive.google.com/file/d/1ZXXCXSbbFS2PDlrs9kbJL9pE6-5nPPxi/view
+  - training config file: conf/train.yaml
+  - decoding config file: conf/decode.yaml
+  - preprocess config file: conf/specaug.yaml
+  - lm config file: conf/lm.yaml
+  - cmvn file: data/train/cmvn.ark
+  - e2e file: exp/audio/model.last10.avg.best
+  - e2e json file: exp/audio/model.json
+  - lm file: exp/train_rnnlm_pytorch_lm_unigram500/rnnlm.model.best
+  - lm JSON file: exp/train_rnnlm_pytorch_lm_unigram500/model.json
+  - dict file: data/lang_char/train_unigram500_units.txt
+
+
+### CER
+
+|dataset|SNR in dB|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|---|
+|music noise|-12|171|3828|15.7|25.9|58.5|0.1|84.4|100.0|
+||-9 |187|4085|19.1|25.5|55.4|0.0|81.0|100.0|
+||-6 |176|3922|20.8|25.0|54.3|0.0|79.2|100.0|
+||-3 |201|4641|19.3|24.4|56.3|0.0|80.8|100.0|
+|| 0 |158|3495|19.7|25.6|54.6|0.0|80.3|100.0|
+|| 3 |173|3668|20.6|24.8|54.5|0.1|79.5|100.0|
+|| 6 |185|4145|21.0|24.5|54.4|0.0|79.0|100.0|
+|| 9 |157|3393|19.9|24.8|55.3|0.1|80.2|100.0|
+||12 |150|3374|19.8|24.9|55.3|0.0|80.2|100.0|
+||clean |138|3062|20.0|24.7|55.3|0.0|80.0|100.0|
+||reverb |177|3898|19.3|24.8|55.9|0.0|80.7|100.0|
+|ambient noise|-12|187|4095|18.1|25.1|56.7|0.2|82.0|100.0|
+||-9 |193|4277|18.5|25.7|55.8|0.1|81.6|100.0|
+||-6 |176|4128|18.7|25.7|55.6|0.0|81.3|100.0|
+||-3 |173|4056|19.0|25.2|55.8|0.1|81.1|100.0|
+|| 0 |148|3328|18.7|24.7|56.6|0.1|81.5|100.0|
+|| 3 |176|3758|20.3|24.7|55.0|0.0|79.7|100.0|
+|| 6 |166|3581|21.6|24.9|53.5|0.0|78.4|100.0|
+|| 9 |170|3600|18.9|25.0|56.1|0.0|81.1|100.0|
+||12 |169|3728|20.9|24.5|54.6|0.0|79.2|100.0|
+||clean |138|3062|20.0|24.7|55.3|0.0|80.0|100.0|
+||reverb |177|3898|19.3|24.8|55.9|0.0|80.7|100.0|
+
+### WER
+
+|dataset|SNR in dB|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|---|
+|music noise|-12|171|912|83.4|12.5|4.1|2.4|19.0|38.6|
+||-9 |187|1005|87.6|8.6|3.9|1.9|14.3|33.2|
+||-6 |176|951|90.6|5.9|3.5|0.8|10.2|26.7|
+||-3 |201|1097|94.4|3.3|2.3|0.6|6.2|20.4|
+|| 0 |158|847|94.9|3.2|1.9|0.4|5.4|19.0|
+|| 3 |173|884|94.2|3.8|1.9|0.6|6.3|24.9|
+|| 6 |185|997|96.3|2.7|1.0|0.7|4.4|17.8|
+|| 9 |157|817|96.9|1.7|1.3|0.4|3.4|13.4|
+||12 |150|832|95.2|2.9|1.9|0.5|5.3|20.7|
+||clean |138|739|95.7|2.4|1.9|0.4|4.7|17.4|
+||reverb |177|943|93.6|4.0|2.3|0.4|6.8|23.2|
+|ambient noise|-12|187|995|73.7|18.4|7.9|1.7|28.0|51.9|
+||-9 |193|1060|83.0|11.7|5.3|1.4|18.4|40.4|
+||-6 |176|971|90.2|6.8|3.0|1.4|11.2|26.1|
+||-3 |173|972|90.0|6.9|3.1|1.0|11.0|32.9|
+|| 0 |148|838|94.0|4.1|1.9|0.4|6.3|23.6|
+|| 3 |176|909|95.5|2.9|1.7|0.3|4.8|17.0|
+|| 6 |166|830|94.1|3.3|2.7|1.0|6.9|20.5|
+|| 9 |170|872|95.4|3.1|1.5|0.2|4.8|18.2|
+||12 |169|895|95.0|4.0|1.0|0.2|5.3|20.1|
+||clean |138|739|95.7|2.4|1.9|0.4|4.7|17.4|
+||reverb |177|943|93.6|4.0|2.3|0.4|6.8|23.2|
+
+## Train_pytorch_trainvideo_delta_specaug (Video-Only)
+
+* Model files (archived to model.tar.gz by $ pack_model.sh)
+  - download link: https://drive.google.com/file/d/1ZXXCXSbbFS2PDlrs9kbJL9pE6-5nPPxi/view
+  - training config file: conf/finetunevideo/trainvideo.yaml
+  - decoding config file: conf/decode.yaml
+  - preprocess config file: conf/specaug.yaml
+  - lm config file: conf/lm.yaml
+  - e2e file: exp/vfintune/model.last10.avg.best
+  - e2e json file: exp/vfintune/model.json
+  - lm file: exp/train_rnnlm_pytorch_lm_unigram500/rnnlm.model.best
+  - lm JSON file: exp/train_rnnlm_pytorch_lm_unigram500/model.json
+  - dict file: data/lang_char/train_unigram500_units.txt
+
+
+### CER
+
+|dataset|SNR in dB|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|---|
+|clean visual data|-12|171|3828|10.7|25.1|64.2|0.3|89.6|100.0|
+||-9 |187|4085|12.6|26.1|61.2|0.2|87.5|100.0|
+||-6 |176|3922|13.4|27.3|59.3|0.4|87.0|100.0|
+||-3 |201|4641|11.8|26.8|61.4|0.3|88.6|100.0|
+|| 0 |158|3495|11.6|26.1|62.3|0.2|88.6|100.0|
+|| 3 |173|3668|12.0|27.3|60.7|0.2|88.2|100.0|
+|| 6 |185|4145|12.5|26.0|61.4|0.3|87.7|100.0|
+|| 9 |157|3393|12.5|26.2|61.4|0.3|87.8|100.0|
+||12 |150|3374|12.1|25.5|62.4|0.3|88.2|100.0|
+||clean |138|3062|12.4|25.2|62.3|0.3|87.9|100.0|
+||reverb |177|3898|12.2|25.2|62.6|0.3|88.1|100.0|
+|visual gaussian blur|-12|187|4095|11.7|25.6|62.6|0.4|88.7|100.0|
+||-9 |193|4277|11.7|26.4|62.0|0.2|88.5|100.0|
+||-6 |176|4128|11.1|25.8|63.0|0.2|89.1|100.0|
+||-3 |173|4056|11.9|27.3|60.8|0.2|88.4|100.0|
+|| 0 |148|3328|11.2|25.5|63.3|0.1|88.9|100.0|
+|| 3 |176|3758|11.3|25.7|63.0|0.3|89.0|100.0|
+|| 6 |166|3581|11.9|27.0|61.1|0.4|88.6|100.0|
+|| 9 |170|3600|11.8|26.6|61.5|0.4|88.6|100.0|
+||12 |169|3728|11.0|26.6|62.4|0.5|89.5|100.0|
+||clean |138|3062|11.8|25.1|63.1|0.3|88.5|100.0|
+||reverb |177|3898|11.5|25.1|63.4|0.3|88.8|100.0|
+|visual salt and pepper noise|-12|187|4095|10.7|27.3|62.0|0.3|89.5|100.0|
+||-9 |193|4277|11.9|28.7|59.4|0.4|88.6|100.0|
+||-6 |176|4128|10.5|27.4|62.1|0.3|89.8|100.0|
+||-3 |173|4056|11.9|27.5|60.6|0.3|88.5|100.0|
+|| 0 |148|3328|12.0|25.5|62.5|0.3|88.3|100.0|
+|| 3 |176|3758|10.9|27.1|62.0|0.3|89.5|100.0|
+|| 6 |166|3581|11.9|27.2|60.9|0.3|88.4|100.0|
+|| 9 |170|3600|11.7|27.0|61.4|0.4|88.7|100.0|
+||12 |169|3728|10.5|28.2|61.3|0.3|89.9|100.0|
+||clean |138|3062|11.5|26.7|61.8|0.5|88.9|100.0|
+||reverb |177|3898|11.4|26.5|62.1|0.4|89.0|100.0|
+
+### WER
+
+|dataset|SNR in dB|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|---|
+|clean visual data|-12|171|912|39.4|42.7|18.0|4.3|64.9|89.5|
+||-9 |187|1005|43.7|40.6|15.7|5.4|61.7|86.1|
+||-6 |176|951|43.3|42.6|14.1|4.1|60.8|88.6|
+||-3 |201|1097|41.3|44.2|14.5|5.3|64.0|85.6|
+|| 0 |158|847|44.3|37.8|17.9|6.1|61.9|85.4|
+|| 3 |173|884|44.2|39.7|16.1|5.3|61.1|84.4|
+|| 6 |185|997|38.2|44.8|17.0|3.9|65.7|84.9|
+|| 9 |157|817|47.9|37.1|15.1|5.5|57.6|80.3|
+||12 |150|832|42.9|37.6|19.5|5.3|62.4|84.0|
+||clean |138|739|45.9|39.1|15.0|5.3|59.4|85.5|
+||reverb |177|943|43.4|40.5|16.1|5.3|61.9|85.9|
+|visual Gaussian blur|-12|187|995|35.9|45.4|18.7|5.3|69.4|86.6|
+||-9 |193|1060|35.0|44.2|20.8|5.0|70.0|92.2|
+||-6 |176|971|38.2|43.2|18.6|4.6|66.4|87.5|
+||-3 |173|972|37.9|45.5|16.7|4.8|67.0|86.1|
+|| 0 |148|838|38.1|40.7|21.2|4.2|66.1|89.2|
+|| 3 |176|909|36.0|48.5|15.5|5.9|70.0|88.6|
+|| 6 |166|830|36.7|46.6|16.6|6.1|69.4|89.8|
+|| 9 |170|872|39.0|45.5|15.5|4.7|65.7|87.6|
+||12 |169|895|35.2|46.8|18.0|4.6|69.4|89.9|
+||clean |138|739|40.7|42.2|17.1|5.0|64.3|88.4|
+||reverb |177|943|38.0|44.3|17.7|5.0|67.0|89.3|
+|visual salt and pepper noise|-12|187|995|32.5|48.9|18.6|4.6|72.2|83.4|
+||-9 |193|1060|32.3|51.5|16.2|6.1|73.9|92.2|
+||-6 |176|971|36.5|47.3|16.3|7.2|70.8|86.4|
+||-3 |173|972|35.5|47.2|17.3|4.6|69.1|88.4|
+|| 0 |148|838|36.9|41.5|21.6|3.7|66.8|88.5|
+|| 3 |176|909|33.0|51.9|15.1|5.4|72.4|88.6|
+|| 6 |166|830|35.3|49.9|14.8|8.8|73.5|88.0|
+|| 9 |170|872|41.2|43.3|15.5|5.6|64.4|84.7|
+||12 |169|895|34.2|47.8|18.0|7.3|73.1|91.1|
+||clean |138|739|37.5|47.8|14.7|7.3|69.8|86.2|
+||reverb |177|943|35.9|47.9|16.1|6.7|70.7|87.0|
+
+## Train_pytorch_trainavs_delta_specaug (Audio-Visual)
+
+* Model files (archived to model.tar.gz by $ pack_model.sh)
+  - download link: https://drive.google.com/file/d/1ZXXCXSbbFS2PDlrs9kbJL9pE6-5nPPxi/view
+  - training config file: conf/finetuneav/trainavs.yaml
+  - decoding config file: conf/decode.yaml
+  - preprocess config file: conf/specaug.yaml
+  - lm config file: conf/lm.yaml
+  - cmvn file: data/train/cmvn.ark
+  - e2e file: exp/avfintune/model.last10.avg.best
+  - e2e json file: exp/avfintune/model.json
+  - lm file: exp/train_rnnlm_pytorch_lm_unigram500/rnnlm.model.best
+  - lm JSON file: exp/train_rnnlm_pytorch_lm_unigram500/model.json
+  - dict file: data/lang_char/train_unigram500_units.txt
+
+
+### CER
+
+|dataset|SNR in dB|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|---|
+|music noise with clean visual data |-12|171|3828|16.7|25.5|57.8|0.0|83.4|100.0|
+||-9 |187|4085|20.5|24.8|54.7|0.0|79.6|100.0|
+||-6 |176|3922|21.1|24.6|54.3|0.0|78.9|100.0|
+||-3 |201|4641|19.8|24.4|55.7|0.0|80.2|100.0|
+|| 0 |158|3495|19.9|25.4|54.6|0.0|80.1|100.0|
+|| 3 |173|3668|21.0|25.0|54.1|0.0|79.1|100.0|
+|| 6 |185|4145|21.1|24.3|54.5|0.0|78.9|100.0|
+|| 9 |157|3393|19.6|25.0|55.4|0.0|80.4|100.0|
+||12 |150|3374|20.0|24.7|55.2|0.0|80.0|100.0|
+||clean |138|3062|20.2|24.8|55.1|0.0|79.8|100.0|
+||reverb |177|3898|19.7|24.7|55.7|0.0|80.3|100.0|
+|ambient noise with clean visual data |-12|187|4095|19.5|24.6|55.9|0.1|80.6|100.0|
+||-9 |193|4277|19.4|25.2|55.4|0.0|80.6|100.0|
+||-6 |176|4128|18.8|25.2|56.0|0.0|81.2|100.0|
+||-3 |173|4056|19.6|25.2|55.2|0.0|80.4|100.0|
+|| 0 |148|3328|18.9|24.4|56.7|0.0|81.1|100.0|
+|| 3 |176|3758|20.7|24.7|54.6|0.0|79.3|100.0|
+|| 6 |166|3581|22.0|25.1|52.9|0.0|78.0|100.0|
+|| 9 |170|3600|19.0|25.0|56.0|0.0|81.0|100.0|
+||12 |169|3728|20.7|24.5|54.8|0.0|79.3|100.0|
+||clean |138|3062|20.2|24.8|55.1|0.0|79.8|100.0|
+||reverb |177|3898|19.7|24.7|55.7|0.0|80.3|100.0|
+|ambient noise with visual Gaussian blur|-12|187|4095|19.0|24.5|56.5|0.1|81.0|100.0|
+||-9 |193|4277|19.6|25.2|55.2|0.0|80.4|100.0|
+||-6 |176|4128|18.7|25.2|56.1|0.0|81.3|100.0|
+||-3 |173|4056|19.7|25.2|55.1|0.0|80.4|100.0|
+|| 0 |148|3328|18.9|24.4|56.7|0.0|81.1|100.0|
+|| 3 |176|3758|20.6|24.6|54.7|0.0|79.4|100.0|
+|| 6 |166|3581|21.9|25.1|53.0|0.0|78.1|100.0|
+|| 9 |170|3600|19.0|24.9|56.1|0.0|81.0|100.0|
+||12 |169|3728|20.5|24.6|54.9|0.0|79.5|100.0|
+||clean |138|3062|20.2|24.8|55.1|0.0|79.8|100.0|
+||reverb |177|3898|19.7|24.5|55.8|0.0|80.3|100.0|
+|ambient noise with visual salt and pepper noise|-12|187|4095|19.0|24.8|56.2|0.1|81.0|100.0|
+||-9 |193|4277|19.6|25.5|54.9|0.0|80.4|100.0|
+||-6 |176|4128|18.7|25.1|56.2|0.0|81.3|100.0|
+||-3 |173|4056|19.7|25.5|54.8|0.0|80.3|100.0|
+|| 0 |148|3328|18.9|24.4|56.7|0.0|81.1|100.0|
+|| 3 |176|3758|20.8|24.6|54.6|0.0|79.2|100.0|
+|| 6 |166|3581|22.0|25.1|52.9|0.0|78.1|100.0|
+|| 9 |170|3600|19.0|24.9|56.1|0.0|81.0|100.0|
+||12 |169|3728|20.8|24.7|54.6|0.0|79.2|100.0|
+||clean |138|3062|20.2|24.9|55.0|0.0|79.8|100.0|
+||reverb |177|3898|19.7|24.6|55.7|0.0|80.3|100.0|
+
+### WER
+
+|dataset|SNR in dB|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|---|
+|music noise with clean visual data |-12|171|912|91.2|6.0|2.7|1.5|10.3|26.3|
+||-9 |187|1005|93.2|4.5|2.3|0.4|7.2|25.1|
+||-6 |176|951|94.1|3.7|2.2|0.3|6.2|18.8|
+||-3 |201|1097|95.2|2.7|2.1|0.4|5.2|15.9|
+|| 0 |158|847|96.7|2.2|1.1|0.4|3.7|13.9|
+|| 3 |173|884|95.6|2.6|1.8|0.3|4.8|17.9|
+|| 6 |185|997|95.5|2.3|2.2|0.7|5.2|18.9|
+|| 9 |157|817|96.2|2.1|1.7|0.7|4.5|14.0|
+||12 |150|832|95.1|2.4|2.5|0.2|5.2|21.3|
+||clean |138|739|97.2|1.5|1.4|0.4|3.2|13.8|
+||reverb |177|943|96.0|1.8|2.2|0.3|4.3|16.4|
+|ambient noise with clean visual data |-12|187|995|90.4|6.9|2.7|1.1|10.8|31.0|
+||-9 |193|1060|91.3|5.6|3.1|1.4|10.1|29.0|
+||-6 |176|971|94.4|2.9|2.7|0.3|5.9|21.0|
+||-3 |173|972|93.7|3.7|2.6|0.1|6.4|22.0|
+|| 0 |148|838|95.7|2.0|2.3|0.1|4.4|16.9|
+|| 3 |176|909|97.0|1.5|1.4|0.3|3.3|12.5|
+|| 6 |166|830|96.0|1.9|2.0|0.6|4.6|16.3|
+|| 9 |170|872|95.6|3.4|0.9|0.2|4.6|17.1|
+||12 |169|895|94.0|3.7|2.3|0.4|6.5|20.7|
+||clean |138|739|97.2|1.5|1.4|0.4|3.2|13.8|
+||reverb |177|943|96.0|1.8|2.2|0.3|4.3|16.4|
+|ambient noise with visual Gaussian blur|-12|187|995|87.0|9.1|3.8|1.0|14.0|35.8|
+||-9 |193|1060|90.6|6.2|3.2|1.1|10.6|30.1|
+||-6 |176|971|93.2|3.6|3.2|0.3|7.1|24.4|
+||-3 |173|972|94.0|3.6|2.4|0.1|6.1|21.4|
+|| 0 |148|838|95.6|2.3|2.1|0.2|4.7|17.6|
+|| 3 |176|909|96.3|1.7|2.1|0.3|4.1|13.6|
+|| 6 |166|830|95.4|2.3|2.3|0.6|5.2|18.1|
+|| 9 |170|872|95.6|3.1|1.3|0.2|4.6|16.5|
+||12 |169|895|93.2|4.4|2.5|0.4|7.3|23.1|
+||clean |138|739|97.0|1.5|1.5|0.4|3.4|14.5|
+||reverb |177|943|95.7|1.7|2.7|0.3|4.7|16.9|
+|ambient noise with visual salt and pepper noise|-12|187|995|87.1|8.8|4.0|0.9|13.8|35.8|
+||-9 |193|1060|90.5|6.3|3.2|1.1|10.7|30.6|
+||-6 |176|971|93.3|3.2|3.5|0.3|7.0|24.4|
+||-3 |173|972|94.7|3.8|1.5|0.2|5.6|20.2|
+|| 0 |148|838|95.3|2.4|2.3|0.2|4.9|18.2|
+|| 3 |176|909|96.8|1.4|1.8|0.3|3.5|13.1|
+|| 6 |166|830|95.9|2.2|1.9|0.7|4.8|17.5|
+|| 9 |170|872|95.6|3.1|1.3|0.2|4.6|16.5|
+||12 |169|895|94.7|3.5|1.8|0.3|5.6|18.9|
+||clean |138|739|97.4|1.5|1.1|0.4|3.0|13.0|
+||reverb |177|943|95.8|1.9|2.3|0.4|4.7|16.9|
diff --git a/egs/lrs/avsr1/cmd.sh b/egs/lrs/avsr1/cmd.sh
new file mode 100755
index 00000000000..4d70c9c7a79
--- /dev/null
+++ b/egs/lrs/avsr1/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+# --time