From 3afd910c2278ffd99009e2308491b66474078e5a Mon Sep 17 00:00:00 2001 From: Yifan Peng Date: Wed, 2 Mar 2022 16:47:52 -0500 Subject: [PATCH 1/7] add stochastic depth rate in the conformer encoder --- espnet2/asr/encoder/conformer_encoder.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/espnet2/asr/encoder/conformer_encoder.py b/espnet2/asr/encoder/conformer_encoder.py index 00f09070cda..f0a3e5bb501 100644 --- a/espnet2/asr/encoder/conformer_encoder.py +++ b/espnet2/asr/encoder/conformer_encoder.py @@ -6,6 +6,7 @@ from typing import List from typing import Optional from typing import Tuple +from typing import Union import logging import torch @@ -105,6 +106,7 @@ def __init__( padding_idx: int = -1, interctc_layer_idx: List[int] = [], interctc_use_conditioning: bool = False, + stochastic_depth_rate: Union[float, List[float]] = 0.0, ): assert check_argument_types() super().__init__() @@ -250,6 +252,15 @@ def __init__( convolution_layer = ConvolutionModule convolution_layer_args = (output_size, cnn_module_kernel, activation) + if isinstance(stochastic_depth_rate, float): + stochastic_depth_rate = [stochastic_depth_rate] * num_blocks + + if len(stochastic_depth_rate) != num_blocks: + raise ValueError( + f"Length of stochastic_depth_rate ({len(stochastic_depth_rate)}) " + f"should be equal to num_blocks ({num_blocks})" + ) + self.encoders = repeat( num_blocks, lambda lnum: EncoderLayer( @@ -261,6 +272,7 @@ def __init__( dropout_rate, normalize_before, concat_after, + stochastic_depth_rate[lnum] ), ) if self.normalize_before: From bdb171623be6f0ab1cd26526306eff376fe3717e Mon Sep 17 00:00:00 2001 From: Yifan Peng Date: Sun, 6 Mar 2022 15:25:15 -0500 Subject: [PATCH 2/7] update config --- egs2/librispeech/asr1/conf/decode_asr.yaml | 4 +- .../conf/tuning/train_asr_conformer8.yaml | 76 ++++++++++++++++ ...ain_asr_conformer9_layerdrop0.1_last6.yaml | 90 +++++++++++++++++++ 3 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 egs2/librispeech/asr1/conf/tuning/train_asr_conformer8.yaml create mode 100644 egs2/librispeech/asr1/conf/tuning/train_asr_conformer9_layerdrop0.1_last6.yaml diff --git a/egs2/librispeech/asr1/conf/decode_asr.yaml b/egs2/librispeech/asr1/conf/decode_asr.yaml index fe5290e82d1..f89d2168fd1 100644 --- a/egs2/librispeech/asr1/conf/decode_asr.yaml +++ b/egs2/librispeech/asr1/conf/decode_asr.yaml @@ -1,3 +1,3 @@ -lm_weight: 0.6 -ctc_weight: 0.4 beam_size: 60 +ctc_weight: 0.2 +lm_weight: 0. diff --git a/egs2/librispeech/asr1/conf/tuning/train_asr_conformer8.yaml b/egs2/librispeech/asr1/conf/tuning/train_asr_conformer8.yaml new file mode 100644 index 00000000000..5ff37537086 --- /dev/null +++ b/egs2/librispeech/asr1/conf/tuning/train_asr_conformer8.yaml @@ -0,0 +1,76 @@ +# Trained with Tesla V100 (32GB) x 3 GPUs. It takes about 3 days. +encoder: conformer +encoder_conf: + output_size: 512 + attention_heads: 8 + linear_units: 2048 + num_blocks: 12 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d + normalize_before: true + macaron_style: true + rel_pos_type: latest + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + activation_type: swish + use_cnn_module: true + cnn_module_kernel: 31 + +decoder: transformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false + +frontend_conf: + n_fft: 512 + hop_length: 256 + +use_amp: true +num_workers: 4 +batch_type: numel +batch_bins: 35000000 +accum_grad: 4 +max_epoch: 50 +patience: none +init: none +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 10 + +optim: adam +optim_conf: + lr: 0.0025 + weight_decay: 0.000001 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 40000 + +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0. + - 0.05 + num_time_mask: 10 diff --git a/egs2/librispeech/asr1/conf/tuning/train_asr_conformer9_layerdrop0.1_last6.yaml b/egs2/librispeech/asr1/conf/tuning/train_asr_conformer9_layerdrop0.1_last6.yaml new file mode 100644 index 00000000000..bb89c68826b --- /dev/null +++ b/egs2/librispeech/asr1/conf/tuning/train_asr_conformer9_layerdrop0.1_last6.yaml @@ -0,0 +1,90 @@ +# Trained with Tesla V100 (32GB) x 3 GPUs. It takes about 3 days. +encoder: conformer +encoder_conf: + output_size: 512 + attention_heads: 8 + linear_units: 2048 + num_blocks: 12 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d + normalize_before: true + macaron_style: true + rel_pos_type: latest + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + activation_type: swish + use_cnn_module: true + cnn_module_kernel: 31 + stochastic_depth_rate: + - 0.0 + - 0.0 + - 0.0 + - 0.0 + - 0.0 + - 0.0 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + +decoder: transformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 + +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 + length_normalized_loss: false + +frontend_conf: + n_fft: 512 + hop_length: 256 + +unused_parameters: true # due to layer dropout, some layers are not used +use_amp: true +num_workers: 4 +batch_type: numel +batch_bins: 35000000 +accum_grad: 4 +max_epoch: 50 +patience: none +init: none +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 10 + +optim: adam +optim_conf: + lr: 0.0025 + weight_decay: 0.000001 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 40000 + +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 27 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_ratio_range: + - 0. + - 0.05 + num_time_mask: 10 From 53b5c17a3f88718cfc57f4814b0c22dbf4e73287 Mon Sep 17 00:00:00 2001 From: Yifan Peng Date: Sun, 6 Mar 2022 20:25:50 -0500 Subject: [PATCH 3/7] apply black to conformer_encoder.py --- espnet2/asr/encoder/conformer_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espnet2/asr/encoder/conformer_encoder.py b/espnet2/asr/encoder/conformer_encoder.py index f0a3e5bb501..c0c3d92fd1c 100644 --- a/espnet2/asr/encoder/conformer_encoder.py +++ b/espnet2/asr/encoder/conformer_encoder.py @@ -272,7 +272,7 @@ def __init__( dropout_rate, normalize_before, concat_after, - stochastic_depth_rate[lnum] + stochastic_depth_rate[lnum], ), ) if self.normalize_before: From 328549538602866f6e7dacaf376f34963243feb9 Mon Sep 17 00:00:00 2001 From: Yifan Peng Date: Sun, 6 Mar 2022 23:41:09 -0500 Subject: [PATCH 4/7] update test --- egs2/librispeech/asr1/README.md | 14 ++++++++++++++ .../asr/encoder/test_conformer_encoder.py | 19 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/egs2/librispeech/asr1/README.md b/egs2/librispeech/asr1/README.md index 64a859f07b7..365f10af002 100644 --- a/egs2/librispeech/asr1/README.md +++ b/egs2/librispeech/asr1/README.md @@ -112,6 +112,20 @@ |decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_clean|2620|65818|97.7|1.6|0.7|0.4|2.7|25.7| |decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_other|2939|65101|94.5|3.9|1.5|1.0|6.4|45.1| + +# Conformer, using stochastic depth +- ASR config [conf/tuning/train_asr_conformer9_layerdrop0.1_last6.yaml](conf/tuning/train_asr_conformer9_layerdrop0.1_last6.yaml) +- LM config: [conf/tuning/train_lm_transformer2.yaml](conf/tuning/train_lm_transformer2.yaml) +- Pretrained model: []() + + +# Conformer, new SpecAug, using weight decay in Adam +- ASR config [conf/tuning/train_asr_conformer8.yaml](conf/tuning/train_asr_conformer8.yaml) +- LM config: [conf/tuning/train_lm_transformer2.yaml](conf/tuning/train_lm_transformer2.yaml) +- Pretrained model: []() + + + # Tuning warmup_steps - Note - warmup_steps: 25000 -> 40000 diff --git a/test/espnet2/asr/encoder/test_conformer_encoder.py b/test/espnet2/asr/encoder/test_conformer_encoder.py index ddc2d077f9d..212785c3a56 100644 --- a/test/espnet2/asr/encoder/test_conformer_encoder.py +++ b/test/espnet2/asr/encoder/test_conformer_encoder.py @@ -26,6 +26,9 @@ ([1], True), ], ) +@pytest.mark.parametrize( + "stochastic_depth_rate", [0.0, 0.1, [0.1, 0.1]] +) def test_encoder_forward_backward( input_layer, positionwise_layer_type, @@ -34,6 +37,7 @@ def test_encoder_forward_backward( selfattention_layer_type, interctc_layer_idx, interctc_use_conditioning, + stochastic_depth_rate, ): encoder = ConformerEncoder( 20, @@ -52,6 +56,7 @@ def test_encoder_forward_backward( positionwise_layer_type=positionwise_layer_type, interctc_layer_idx=interctc_layer_idx, interctc_use_conditioning=interctc_use_conditioning, + stochastic_depth_rate=stochastic_depth_rate, ) if input_layer == "embed": x = torch.randint(0, 10, [2, 32]) @@ -128,3 +133,17 @@ def test_encoder_output_size(): def test_encoder_invalid_type(): with pytest.raises(ValueError): ConformerEncoder(20, input_layer="fff") + +def test_encoder_invalid_stochastic_depth_rate(): + with pytest.raises(ValueError): + ConformerEncoder( + 20, + num_blocks=2, + stochastic_depth_rate=[0.1], + ) + with pytest.raises(ValueError): + ConformerEncoder( + 20, + num_blocks=2, + stochastic_depth_rate=[0.1, 0.1, 0.1], + ) From c3569453a408fd4ff4173d9c1d2062c88d1fc060 Mon Sep 17 00:00:00 2001 From: Yifan Peng Date: Sun, 6 Mar 2022 23:58:36 -0500 Subject: [PATCH 5/7] apply black --- test/espnet2/asr/encoder/test_conformer_encoder.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/espnet2/asr/encoder/test_conformer_encoder.py b/test/espnet2/asr/encoder/test_conformer_encoder.py index 212785c3a56..9acb0cd1b8a 100644 --- a/test/espnet2/asr/encoder/test_conformer_encoder.py +++ b/test/espnet2/asr/encoder/test_conformer_encoder.py @@ -26,9 +26,7 @@ ([1], True), ], ) -@pytest.mark.parametrize( - "stochastic_depth_rate", [0.0, 0.1, [0.1, 0.1]] -) +@pytest.mark.parametrize("stochastic_depth_rate", [0.0, 0.1, [0.1, 0.1]]) def test_encoder_forward_backward( input_layer, positionwise_layer_type, @@ -134,6 +132,7 @@ def test_encoder_invalid_type(): with pytest.raises(ValueError): ConformerEncoder(20, input_layer="fff") + def test_encoder_invalid_stochastic_depth_rate(): with pytest.raises(ValueError): ConformerEncoder( From 7c9f39a7f48a36e8b270a19ede8105cd8205d259 Mon Sep 17 00:00:00 2001 From: Yifan Peng Date: Mon, 7 Mar 2022 13:45:59 -0500 Subject: [PATCH 6/7] update results and model links --- egs2/librispeech/asr1/README.md | 154 +++++++++++++++++++++++++++++++- 1 file changed, 152 insertions(+), 2 deletions(-) diff --git a/egs2/librispeech/asr1/README.md b/egs2/librispeech/asr1/README.md index 365f10af002..73cf830ea2e 100644 --- a/egs2/librispeech/asr1/README.md +++ b/egs2/librispeech/asr1/README.md @@ -113,16 +113,166 @@ |decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_other|2939|65101|94.5|3.9|1.5|1.0|6.4|45.1| + # Conformer, using stochastic depth +- Params: 116.15M - ASR config [conf/tuning/train_asr_conformer9_layerdrop0.1_last6.yaml](conf/tuning/train_asr_conformer9_layerdrop0.1_last6.yaml) - LM config: [conf/tuning/train_lm_transformer2.yaml](conf/tuning/train_lm_transformer2.yaml) -- Pretrained model: []() +- Pretrained model: [https://huggingface.co/pyf98/librispeech_conformer_layerdrop0.1_last6](https://huggingface.co/pyf98/librispeech_conformer_layerdrop0.1_last6) + +# RESULTS +## Environments +- date: `Mon Mar 7 12:21:40 EST 2022` +- python version: `3.9.7 (default, Sep 16 2021, 13:09:58) [GCC 7.5.0]` +- espnet version: `espnet 0.10.7a1` +- pytorch version: `pytorch 1.10.1` +- Git hash: `c3569453a408fd4ff4173d9c1d2062c88d1fc060` + - Commit date: `Sun Mar 6 23:58:36 2022 -0500` + +## asr_train_asr_conformer9_layerdrop0.1_last6_raw_en_bpe5000_sp +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|beam60_ctc0.2/dev_clean|2703|54402|98.0|1.8|0.2|0.2|2.2|26.5| +|beam60_ctc0.2/dev_other|2864|50948|95.4|4.2|0.4|0.5|5.1|43.3| +|beam60_ctc0.2/test_clean|2620|52576|97.9|1.9|0.2|0.3|2.4|27.9| +|beam60_ctc0.2/test_other|2939|52343|95.2|4.3|0.5|0.6|5.4|45.4| +|beam60_ctc0.2_lm0.6/dev_clean|2703|54402|98.2|1.5|0.3|0.2|2.0|23.7| +|beam60_ctc0.2_lm0.6/dev_other|2864|50948|96.3|3.2|0.5|0.4|4.1|36.5| +|beam60_ctc0.2_lm0.6/test_clean|2620|52576|98.1|1.6|0.3|0.2|2.1|24.0| +|beam60_ctc0.2_lm0.6/test_other|2939|52343|96.0|3.4|0.6|0.5|4.4|40.5| +|beam60_ctc0.3/dev_clean|2703|54402|98.1|1.8|0.2|0.2|2.1|26.6| +|beam60_ctc0.3/dev_other|2864|50948|95.4|4.2|0.4|0.5|5.1|43.3| +|beam60_ctc0.3/test_clean|2620|52576|97.9|1.9|0.2|0.3|2.4|28.1| +|beam60_ctc0.3/test_other|2939|52343|95.3|4.3|0.4|0.7|5.4|45.7| +|beam60_ctc0.3_lm0.6/dev_clean|2703|54402|98.4|1.4|0.2|0.2|1.8|23.3| +|beam60_ctc0.3_lm0.6/dev_other|2864|50948|96.4|3.2|0.4|0.4|4.0|36.5| +|beam60_ctc0.3_lm0.6/test_clean|2620|52576|98.2|1.6|0.2|0.2|2.0|23.7| +|beam60_ctc0.3_lm0.6/test_other|2939|52343|96.2|3.4|0.5|0.5|4.3|40.4| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|beam60_ctc0.2/dev_clean|2703|288456|99.4|0.3|0.3|0.2|0.8|26.5| +|beam60_ctc0.2/dev_other|2864|265951|98.3|1.0|0.7|0.6|2.3|43.3| +|beam60_ctc0.2/test_clean|2620|281530|99.4|0.3|0.3|0.2|0.8|27.9| +|beam60_ctc0.2/test_other|2939|272758|98.3|1.0|0.7|0.6|2.3|45.4| +|beam60_ctc0.2_lm0.6/dev_clean|2703|288456|99.3|0.3|0.4|0.2|0.9|23.7| +|beam60_ctc0.2_lm0.6/dev_other|2864|265951|98.4|0.9|0.8|0.5|2.1|36.5| +|beam60_ctc0.2_lm0.6/test_clean|2620|281530|99.4|0.3|0.4|0.2|0.8|24.0| +|beam60_ctc0.2_lm0.6/test_other|2939|272758|98.4|0.8|0.8|0.5|2.1|40.5| +|beam60_ctc0.3/dev_clean|2703|288456|99.5|0.3|0.2|0.2|0.7|26.6| +|beam60_ctc0.3/dev_other|2864|265951|98.3|1.0|0.7|0.6|2.3|43.3| +|beam60_ctc0.3/test_clean|2620|281530|99.5|0.3|0.3|0.2|0.8|28.1| +|beam60_ctc0.3/test_other|2939|272758|98.4|1.0|0.7|0.6|2.3|45.7| +|beam60_ctc0.3_lm0.6/dev_clean|2703|288456|99.5|0.3|0.3|0.2|0.7|23.3| +|beam60_ctc0.3_lm0.6/dev_other|2864|265951|98.5|0.8|0.7|0.5|1.9|36.5| +|beam60_ctc0.3_lm0.6/test_clean|2620|281530|99.5|0.2|0.3|0.2|0.7|23.7| +|beam60_ctc0.3_lm0.6/test_other|2939|272758|98.5|0.7|0.7|0.5|2.0|40.4| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|beam60_ctc0.2/dev_clean|2703|68010|97.5|1.8|0.7|0.4|2.8|26.5| +|beam60_ctc0.2/dev_other|2864|63110|94.2|4.3|1.5|0.8|6.6|43.3| +|beam60_ctc0.2/test_clean|2620|65818|97.4|1.8|0.8|0.3|3.0|27.9| +|beam60_ctc0.2/test_other|2939|65101|94.1|4.1|1.8|0.8|6.7|45.4| +|beam60_ctc0.2_lm0.6/dev_clean|2703|68010|97.7|1.5|0.8|0.3|2.6|23.7| +|beam60_ctc0.2_lm0.6/dev_other|2864|63110|95.1|3.4|1.5|0.6|5.5|36.5| +|beam60_ctc0.2_lm0.6/test_clean|2620|65818|97.6|1.5|0.9|0.3|2.7|24.0| +|beam60_ctc0.2_lm0.6/test_other|2939|65101|94.8|3.3|1.9|0.6|5.7|40.5| +|beam60_ctc0.3/dev_clean|2703|68010|97.6|1.7|0.7|0.3|2.7|26.6| +|beam60_ctc0.3/dev_other|2864|63110|94.2|4.3|1.5|0.8|6.6|43.3| +|beam60_ctc0.3/test_clean|2620|65818|97.4|1.8|0.8|0.3|2.9|28.1| +|beam60_ctc0.3/test_other|2939|65101|94.2|4.1|1.7|0.8|6.6|45.7| +|beam60_ctc0.3_lm0.6/dev_clean|2703|68010|97.9|1.4|0.7|0.3|2.4|23.3| +|beam60_ctc0.3_lm0.6/dev_other|2864|63110|95.2|3.4|1.5|0.6|5.5|36.5| +|beam60_ctc0.3_lm0.6/test_clean|2620|65818|97.7|1.5|0.8|0.3|2.6|23.7| +|beam60_ctc0.3_lm0.6/test_other|2939|65101|95.0|3.2|1.8|0.6|5.6|40.4| + # Conformer, new SpecAug, using weight decay in Adam +- Params: 116.15M - ASR config [conf/tuning/train_asr_conformer8.yaml](conf/tuning/train_asr_conformer8.yaml) - LM config: [conf/tuning/train_lm_transformer2.yaml](conf/tuning/train_lm_transformer2.yaml) -- Pretrained model: []() +- Pretrained model: [https://huggingface.co/pyf98/librispeech_conformer](https://huggingface.co/pyf98/librispeech_conformer) + +# RESULTS +## Environments +- date: `Mon Mar 7 12:26:10 EST 2022` +- python version: `3.9.7 (default, Sep 16 2021, 13:09:58) [GCC 7.5.0]` +- espnet version: `espnet 0.10.7a1` +- pytorch version: `pytorch 1.10.1` +- Git hash: `c3569453a408fd4ff4173d9c1d2062c88d1fc060` + - Commit date: `Sun Mar 6 23:58:36 2022 -0500` + +## asr_train_asr_conformer8_raw_en_bpe5000_sp +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|beam60_ctc0.2/dev_clean|2703|54402|98.0|1.8|0.2|0.2|2.2|27.2| +|beam60_ctc0.2/dev_other|2864|50948|95.1|4.4|0.5|0.5|5.4|43.3| +|beam60_ctc0.2/test_clean|2620|52576|97.9|1.9|0.2|0.3|2.4|28.8| +|beam60_ctc0.2/test_other|2939|52343|95.2|4.3|0.5|0.6|5.4|45.5| +|beam60_ctc0.2_lm0.6/dev_clean|2703|54402|98.3|1.4|0.3|0.2|1.9|23.7| +|beam60_ctc0.2_lm0.6/dev_other|2864|50948|96.2|3.3|0.4|0.4|4.2|37.2| +|beam60_ctc0.2_lm0.6/test_clean|2620|52576|98.2|1.5|0.3|0.2|2.0|24.3| +|beam60_ctc0.2_lm0.6/test_other|2939|52343|96.1|3.3|0.6|0.4|4.4|39.9| +|beam60_ctc0.3/dev_clean|2703|54402|98.1|1.8|0.2|0.2|2.1|27.3| +|beam60_ctc0.3/dev_other|2864|50948|95.2|4.4|0.4|0.5|5.4|43.7| +|beam60_ctc0.3/test_clean|2620|52576|97.9|1.9|0.2|0.3|2.3|29.0| +|beam60_ctc0.3/test_other|2939|52343|95.2|4.3|0.4|0.6|5.4|45.7| +|beam60_ctc0.3_lm0.6/dev_clean|2703|54402|98.4|1.4|0.2|0.2|1.8|23.5| +|beam60_ctc0.3_lm0.6/dev_other|2864|50948|96.2|3.4|0.4|0.4|4.1|37.4| +|beam60_ctc0.3_lm0.6/test_clean|2620|52576|98.3|1.5|0.2|0.2|1.9|24.1| +|beam60_ctc0.3_lm0.6/test_other|2939|52343|96.2|3.3|0.5|0.5|4.3|39.9| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|beam60_ctc0.2/dev_clean|2703|288456|99.4|0.3|0.3|0.2|0.8|27.2| +|beam60_ctc0.2/dev_other|2864|265951|98.1|1.1|0.8|0.6|2.5|43.3| +|beam60_ctc0.2/test_clean|2620|281530|99.4|0.3|0.3|0.2|0.8|28.8| +|beam60_ctc0.2/test_other|2939|272758|98.3|1.0|0.7|0.6|2.3|45.5| +|beam60_ctc0.2_lm0.6/dev_clean|2703|288456|99.4|0.3|0.3|0.2|0.8|23.7| +|beam60_ctc0.2_lm0.6/dev_other|2864|265951|98.4|0.9|0.7|0.5|2.1|37.2| +|beam60_ctc0.2_lm0.6/test_clean|2620|281530|99.4|0.2|0.4|0.2|0.8|24.3| +|beam60_ctc0.2_lm0.6/test_other|2939|272758|98.5|0.8|0.8|0.5|2.0|39.9| +|beam60_ctc0.3/dev_clean|2703|288456|99.5|0.3|0.2|0.2|0.7|27.3| +|beam60_ctc0.3/dev_other|2864|265951|98.2|1.1|0.7|0.6|2.4|43.7| +|beam60_ctc0.3/test_clean|2620|281530|99.4|0.3|0.3|0.2|0.8|29.0| +|beam60_ctc0.3/test_other|2939|272758|98.4|0.9|0.7|0.6|2.2|45.7| +|beam60_ctc0.3_lm0.6/dev_clean|2703|288456|99.5|0.2|0.2|0.2|0.7|23.5| +|beam60_ctc0.3_lm0.6/dev_other|2864|265951|98.5|0.9|0.7|0.5|2.0|37.4| +|beam60_ctc0.3_lm0.6/test_clean|2620|281530|99.5|0.2|0.3|0.2|0.7|24.1| +|beam60_ctc0.3_lm0.6/test_other|2939|272758|98.6|0.7|0.7|0.5|1.9|39.9| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|beam60_ctc0.2/dev_clean|2703|68010|97.5|1.8|0.7|0.3|2.9|27.2| +|beam60_ctc0.2/dev_other|2864|63110|94.1|4.4|1.6|0.9|6.8|43.3| +|beam60_ctc0.2/test_clean|2620|65818|97.4|1.8|0.8|0.3|2.9|28.8| +|beam60_ctc0.2/test_other|2939|65101|94.1|4.1|1.8|0.8|6.7|45.5| +|beam60_ctc0.2_lm0.6/dev_clean|2703|68010|97.8|1.4|0.8|0.3|2.5|23.7| +|beam60_ctc0.2_lm0.6/dev_other|2864|63110|95.1|3.5|1.5|0.7|5.6|37.2| +|beam60_ctc0.2_lm0.6/test_clean|2620|65818|97.6|1.5|0.9|0.3|2.7|24.3| +|beam60_ctc0.2_lm0.6/test_other|2939|65101|95.0|3.2|1.8|0.6|5.6|39.9| +|beam60_ctc0.3/dev_clean|2703|68010|97.6|1.8|0.7|0.3|2.8|27.3| +|beam60_ctc0.3/dev_other|2864|63110|94.1|4.4|1.5|0.9|6.8|43.7| +|beam60_ctc0.3/test_clean|2620|65818|97.4|1.8|0.7|0.3|2.9|29.0| +|beam60_ctc0.3/test_other|2939|65101|94.2|4.1|1.7|0.8|6.6|45.7| +|beam60_ctc0.3_lm0.6/dev_clean|2703|68010|97.9|1.5|0.7|0.3|2.4|23.5| +|beam60_ctc0.3_lm0.6/dev_other|2864|63110|95.1|3.5|1.4|0.6|5.6|37.4| +|beam60_ctc0.3_lm0.6/test_clean|2620|65818|97.7|1.5|0.8|0.3|2.5|24.1| +|beam60_ctc0.3_lm0.6/test_other|2939|65101|95.1|3.2|1.7|0.6|5.5|39.9| From dabbc3c0158beff3d2673a630d76d92fe189a98a Mon Sep 17 00:00:00 2001 From: Yifan Peng Date: Mon, 7 Mar 2022 13:57:02 -0500 Subject: [PATCH 7/7] only keep the best results --- egs2/librispeech/asr1/README.md | 48 ---------------------- egs2/librispeech/asr1/conf/decode_asr.yaml | 4 +- egs2/librispeech/asr1/run.sh | 2 +- 3 files changed, 3 insertions(+), 51 deletions(-) diff --git a/egs2/librispeech/asr1/README.md b/egs2/librispeech/asr1/README.md index 73cf830ea2e..986479a9946 100644 --- a/egs2/librispeech/asr1/README.md +++ b/egs2/librispeech/asr1/README.md @@ -134,14 +134,6 @@ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| -|beam60_ctc0.2/dev_clean|2703|54402|98.0|1.8|0.2|0.2|2.2|26.5| -|beam60_ctc0.2/dev_other|2864|50948|95.4|4.2|0.4|0.5|5.1|43.3| -|beam60_ctc0.2/test_clean|2620|52576|97.9|1.9|0.2|0.3|2.4|27.9| -|beam60_ctc0.2/test_other|2939|52343|95.2|4.3|0.5|0.6|5.4|45.4| -|beam60_ctc0.2_lm0.6/dev_clean|2703|54402|98.2|1.5|0.3|0.2|2.0|23.7| -|beam60_ctc0.2_lm0.6/dev_other|2864|50948|96.3|3.2|0.5|0.4|4.1|36.5| -|beam60_ctc0.2_lm0.6/test_clean|2620|52576|98.1|1.6|0.3|0.2|2.1|24.0| -|beam60_ctc0.2_lm0.6/test_other|2939|52343|96.0|3.4|0.6|0.5|4.4|40.5| |beam60_ctc0.3/dev_clean|2703|54402|98.1|1.8|0.2|0.2|2.1|26.6| |beam60_ctc0.3/dev_other|2864|50948|95.4|4.2|0.4|0.5|5.1|43.3| |beam60_ctc0.3/test_clean|2620|52576|97.9|1.9|0.2|0.3|2.4|28.1| @@ -155,14 +147,6 @@ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| -|beam60_ctc0.2/dev_clean|2703|288456|99.4|0.3|0.3|0.2|0.8|26.5| -|beam60_ctc0.2/dev_other|2864|265951|98.3|1.0|0.7|0.6|2.3|43.3| -|beam60_ctc0.2/test_clean|2620|281530|99.4|0.3|0.3|0.2|0.8|27.9| -|beam60_ctc0.2/test_other|2939|272758|98.3|1.0|0.7|0.6|2.3|45.4| -|beam60_ctc0.2_lm0.6/dev_clean|2703|288456|99.3|0.3|0.4|0.2|0.9|23.7| -|beam60_ctc0.2_lm0.6/dev_other|2864|265951|98.4|0.9|0.8|0.5|2.1|36.5| -|beam60_ctc0.2_lm0.6/test_clean|2620|281530|99.4|0.3|0.4|0.2|0.8|24.0| -|beam60_ctc0.2_lm0.6/test_other|2939|272758|98.4|0.8|0.8|0.5|2.1|40.5| |beam60_ctc0.3/dev_clean|2703|288456|99.5|0.3|0.2|0.2|0.7|26.6| |beam60_ctc0.3/dev_other|2864|265951|98.3|1.0|0.7|0.6|2.3|43.3| |beam60_ctc0.3/test_clean|2620|281530|99.5|0.3|0.3|0.2|0.8|28.1| @@ -176,14 +160,6 @@ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| -|beam60_ctc0.2/dev_clean|2703|68010|97.5|1.8|0.7|0.4|2.8|26.5| -|beam60_ctc0.2/dev_other|2864|63110|94.2|4.3|1.5|0.8|6.6|43.3| -|beam60_ctc0.2/test_clean|2620|65818|97.4|1.8|0.8|0.3|3.0|27.9| -|beam60_ctc0.2/test_other|2939|65101|94.1|4.1|1.8|0.8|6.7|45.4| -|beam60_ctc0.2_lm0.6/dev_clean|2703|68010|97.7|1.5|0.8|0.3|2.6|23.7| -|beam60_ctc0.2_lm0.6/dev_other|2864|63110|95.1|3.4|1.5|0.6|5.5|36.5| -|beam60_ctc0.2_lm0.6/test_clean|2620|65818|97.6|1.5|0.9|0.3|2.7|24.0| -|beam60_ctc0.2_lm0.6/test_other|2939|65101|94.8|3.3|1.9|0.6|5.7|40.5| |beam60_ctc0.3/dev_clean|2703|68010|97.6|1.7|0.7|0.3|2.7|26.6| |beam60_ctc0.3/dev_other|2864|63110|94.2|4.3|1.5|0.8|6.6|43.3| |beam60_ctc0.3/test_clean|2620|65818|97.4|1.8|0.8|0.3|2.9|28.1| @@ -215,14 +191,6 @@ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| -|beam60_ctc0.2/dev_clean|2703|54402|98.0|1.8|0.2|0.2|2.2|27.2| -|beam60_ctc0.2/dev_other|2864|50948|95.1|4.4|0.5|0.5|5.4|43.3| -|beam60_ctc0.2/test_clean|2620|52576|97.9|1.9|0.2|0.3|2.4|28.8| -|beam60_ctc0.2/test_other|2939|52343|95.2|4.3|0.5|0.6|5.4|45.5| -|beam60_ctc0.2_lm0.6/dev_clean|2703|54402|98.3|1.4|0.3|0.2|1.9|23.7| -|beam60_ctc0.2_lm0.6/dev_other|2864|50948|96.2|3.3|0.4|0.4|4.2|37.2| -|beam60_ctc0.2_lm0.6/test_clean|2620|52576|98.2|1.5|0.3|0.2|2.0|24.3| -|beam60_ctc0.2_lm0.6/test_other|2939|52343|96.1|3.3|0.6|0.4|4.4|39.9| |beam60_ctc0.3/dev_clean|2703|54402|98.1|1.8|0.2|0.2|2.1|27.3| |beam60_ctc0.3/dev_other|2864|50948|95.2|4.4|0.4|0.5|5.4|43.7| |beam60_ctc0.3/test_clean|2620|52576|97.9|1.9|0.2|0.3|2.3|29.0| @@ -236,14 +204,6 @@ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| -|beam60_ctc0.2/dev_clean|2703|288456|99.4|0.3|0.3|0.2|0.8|27.2| -|beam60_ctc0.2/dev_other|2864|265951|98.1|1.1|0.8|0.6|2.5|43.3| -|beam60_ctc0.2/test_clean|2620|281530|99.4|0.3|0.3|0.2|0.8|28.8| -|beam60_ctc0.2/test_other|2939|272758|98.3|1.0|0.7|0.6|2.3|45.5| -|beam60_ctc0.2_lm0.6/dev_clean|2703|288456|99.4|0.3|0.3|0.2|0.8|23.7| -|beam60_ctc0.2_lm0.6/dev_other|2864|265951|98.4|0.9|0.7|0.5|2.1|37.2| -|beam60_ctc0.2_lm0.6/test_clean|2620|281530|99.4|0.2|0.4|0.2|0.8|24.3| -|beam60_ctc0.2_lm0.6/test_other|2939|272758|98.5|0.8|0.8|0.5|2.0|39.9| |beam60_ctc0.3/dev_clean|2703|288456|99.5|0.3|0.2|0.2|0.7|27.3| |beam60_ctc0.3/dev_other|2864|265951|98.2|1.1|0.7|0.6|2.4|43.7| |beam60_ctc0.3/test_clean|2620|281530|99.4|0.3|0.3|0.2|0.8|29.0| @@ -257,14 +217,6 @@ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| |---|---|---|---|---|---|---|---|---| -|beam60_ctc0.2/dev_clean|2703|68010|97.5|1.8|0.7|0.3|2.9|27.2| -|beam60_ctc0.2/dev_other|2864|63110|94.1|4.4|1.6|0.9|6.8|43.3| -|beam60_ctc0.2/test_clean|2620|65818|97.4|1.8|0.8|0.3|2.9|28.8| -|beam60_ctc0.2/test_other|2939|65101|94.1|4.1|1.8|0.8|6.7|45.5| -|beam60_ctc0.2_lm0.6/dev_clean|2703|68010|97.8|1.4|0.8|0.3|2.5|23.7| -|beam60_ctc0.2_lm0.6/dev_other|2864|63110|95.1|3.5|1.5|0.7|5.6|37.2| -|beam60_ctc0.2_lm0.6/test_clean|2620|65818|97.6|1.5|0.9|0.3|2.7|24.3| -|beam60_ctc0.2_lm0.6/test_other|2939|65101|95.0|3.2|1.8|0.6|5.6|39.9| |beam60_ctc0.3/dev_clean|2703|68010|97.6|1.8|0.7|0.3|2.8|27.3| |beam60_ctc0.3/dev_other|2864|63110|94.1|4.4|1.5|0.9|6.8|43.7| |beam60_ctc0.3/test_clean|2620|65818|97.4|1.8|0.7|0.3|2.9|29.0| diff --git a/egs2/librispeech/asr1/conf/decode_asr.yaml b/egs2/librispeech/asr1/conf/decode_asr.yaml index f89d2168fd1..7b44351b5f2 100644 --- a/egs2/librispeech/asr1/conf/decode_asr.yaml +++ b/egs2/librispeech/asr1/conf/decode_asr.yaml @@ -1,3 +1,3 @@ beam_size: 60 -ctc_weight: 0.2 -lm_weight: 0. +ctc_weight: 0.3 +lm_weight: 0.6 diff --git a/egs2/librispeech/asr1/run.sh b/egs2/librispeech/asr1/run.sh index 7035051a859..8ca7155d69d 100755 --- a/egs2/librispeech/asr1/run.sh +++ b/egs2/librispeech/asr1/run.sh @@ -9,7 +9,7 @@ train_set="train_960" valid_set="dev" test_sets="test_clean test_other dev_clean dev_other" -asr_config=conf/tuning/train_asr_conformer7_n_fft512_hop_length256.yaml +asr_config=conf/tuning/train_asr_conformer8.yaml lm_config=conf/tuning/train_lm_transformer2.yaml inference_config=conf/decode_asr.yaml