Add experiment result in egs2/wsj0_2mix/enh1/README.md; Update code i…

…n some files
chintu619 · Mar 7, 2022 · 5f86c11 · 5f86c11
2 parents 7aa90b5 + 6f42960
commit 5f86c11
Show file tree

Hide file tree

Showing 59 changed files with 2,087 additions and 221 deletions.
diff --git a/README.md b/README.md
diff --git a/egs2/README.md b/egs2/README.md
@@ -75,6 +75,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
 | su_openslr36            | Sundanese                                                                               | ASR                     | SUN                  | http://www.openslr.org/36                                                                                    |              |
 | swbd                    | Switchboard Corpus for 2-channel Conversational Telephone Speech (300h)                 | ASR                     | ENG                  | https://catalog.ldc.upenn.edu/LDC97S62                                                                       |              |
 | swbd_da                 | NXT Switchboard Annotations                                                             | SLU                     | ENG                  | https://catalog.ldc.upenn.edu/LDC2009T26                                                                     |              |
+| swbd_sentiment          | Speech Sentiment Annotations                                                            | SLU                     | ENG                  | https://catalog.ldc.upenn.edu/LDC2020T14                                                                    |              |
 | tedlium2                | TED-LIUM corpus release 2                                                               | ASR                     | ENG                  | https://www.openslr.org/19/, http://www.lrec-conf.org/proceedings/lrec2014/pdf/1104_Paper.pdf                |              |
 | thchs30                 | A Free Chinese Speech Corpus Released by CSLT@Tsinghua University                       | TTS                     | CMN                  | https://www.openslr.org/18/                                                                                  |              |
 | timit                   | TIMIT Acoustic-Phonetic Continuous Speech Corpus                                        | ASR                     | ENG                  | https://catalog.ldc.upenn.edu/LDC93S1                                                                        |              |

diff --git a/egs2/TEMPLATE/mt1/mt.sh b/egs2/TEMPLATE/mt1/mt.sh
@@ -460,7 +460,7 @@ if ! "${skip_data_prep}"; then
 
     if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         if [ "${feats_type}" = raw ]; then
-            log "Stage 3: data/ -> ${data_feats}"
+            log "Stage 2: data/ -> ${data_feats}"
 
             for dset in "${train_set}" "${valid_set}" ${test_sets}; do
                 if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then
@@ -508,19 +508,18 @@ if ! "${skip_data_prep}"; then
 
     if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 
-        # Then generate src lang
         if "${token_joint}"; then
             log "Merge src and target data if joint BPE"
 
             cat $tgt_bpe_train_text > ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}
-            [ -z "${src_bpe_train_text}" ] && cat ${src_bpe_train_text} >>  ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}
+            [ ! -z "${src_bpe_train_text}" ] && cat ${src_bpe_train_text} >> ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}
             # Set the new text as the target text
             tgt_bpe_train_text="${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}"
         fi
 
         # First generate tgt lang
         if [ "${tgt_token_type}" = bpe ]; then
-            log "Stage 5a: Generate token_list from ${tgt_bpe_train_text} using BPE for tgt_lang"
+            log "Stage 4a: Generate token_list from ${tgt_bpe_train_text} using BPE for tgt_lang"
 
             mkdir -p "${tgt_bpedir}"
             # shellcheck disable=SC2002
@@ -550,7 +549,7 @@ if ! "${skip_data_prep}"; then
             } > "${tgt_token_list}"
 
         elif [ "${tgt_token_type}" = char ] || [ "${tgt_token_type}" = word ]; then
-            log "Stage 5a: Generate character level token_list from ${tgt_bpe_train_text}  for tgt_lang"
+            log "Stage 4a: Generate character level token_list from ${tgt_bpe_train_text}  for tgt_lang"
 
             _opts="--non_linguistic_symbols ${nlsyms_txt}"
 
@@ -593,10 +592,10 @@ if ! "${skip_data_prep}"; then
 
         # Then generate src lang
         if "${token_joint}"; then
-            log "Stage 5b: Skip separate token construction for src_lang when setting ${token_joint} as true"
+            log "Stage 4b: Skip separate token construction for src_lang when setting ${token_joint} as true"
         else
             if [ "${src_token_type}" = bpe ]; then
-                log "Stage 5b: Generate token_list from ${src_bpe_train_text} using BPE for src_lang"
+                log "Stage 4b: Generate token_list from ${src_bpe_train_text} using BPE for src_lang"
 
                 mkdir -p "${src_bpedir}"
                 # shellcheck disable=SC2002
@@ -626,7 +625,7 @@ if ! "${skip_data_prep}"; then
                 } > "${src_token_list}"
 
             elif [ "${src_token_type}" = char ] || [ "${src_token_type}" = word ]; then
-                log "Stage 5b: Generate character level token_list from ${src_bpe_train_text}  for src_lang"
+                log "Stage 4b: Generate character level token_list from ${src_bpe_train_text}  for src_lang"
 
                 _opts="--non_linguistic_symbols ${nlsyms_txt}"
 
@@ -650,8 +649,6 @@ if ! "${skip_data_prep}"; then
                 log "Error: not supported --token_type '${src_token_type}'"
                 exit 2
             fi
-
-
         fi
     fi
 

diff --git a/egs2/dns_ins20/enh1/README.md b/egs2/dns_ins20/enh1/README.md
@@ -14,9 +14,33 @@
  - config: ./conf/tuning/train_enh_blstm_tf.yaml
  - Pretrained model: https://zenodo.org/record/4923697
 
-|dataset|STOI|SAR|SDR|SIR|
-|---|---|---|---|---|
-|enhanced_cv_synthetic|0.95|18.63|18.63|0.00|
-|enhanced_tt_synthetic_no_reverb|0.92|10.92|10.92|0.00|
-|enhanced_tt_synthetic_with_reverb|0.85|9.31|9.31|0.00|
+| dataset                           | STOI | SAR   | SDR   | SIR  |
+| --------------------------------- | ---- | ----- | ----- | ---- |
+| enhanced_cv_synthetic             | 0.95 | 18.63 | 18.63 | 0.00 |
+| enhanced_tt_synthetic_no_reverb   | 0.92 | 10.92 | 10.92 | 0.00 |
+| enhanced_tt_synthetic_with_reverb | 0.85 | 9.31  | 9.31  | 0.00 |
 
+<!-- Generated by ./scripts/utils/show_enh_score.sh -->
+# RESULTS
+## Environments
+- date: `Thu Feb 10 23:11:40 CST 2022`
+- python version: `3.8.12 (default, Oct 12 2021, 13:49:34)  [GCC 7.5.0]`
+- espnet version: `espnet 0.10.5a1`
+- pytorch version: `pytorch 1.9.1`
+- Git hash: `6f66283b9eed7b0d5e5643feb18d8f60118a4afc`
+  - Commit date: `Mon Dec 13 15:30:29 2021 +0800`
+
+
+## enh_train_enh_dccrn_raw
+
+- config: ./conf/tuning/train_enh_dccrn.yaml
+- download_model: https://huggingface.co/Johnson-Lsx/Shaoxiong_Lin_dns_ins20_enh_enh_train_enh_dccrn_raw
+
+| dataset                           | PESQ | STOI | SAR   | SDR   | SIR  | SI_SNR |
+| --------------------------------- | ---- | ---- | ----- | ----- | ---- | ------ |
+| enhanced_cv_synthetic             | 3.72 | 0.98 | 24.69 | 24.69 | 0.00 | 24.22  |
+| enhanced_tt_synthetic_no_reverb   | 3.29 | 0.96 | 17.69 | 17.69 | 0.00 | 17.50  |
+| enhanced_tt_synthetic_with_reverb | 2.54 | 0.81 | 10.45 | 10.45 | 0.00 | 9.72   |
+
+- Note: The model is trained only on data without reverberation.
+- Note: The PESQ score is calculated using https://github.com/vBaiCai/python-pesq.
diff --git a/egs2/dns_ins20/enh1/conf/tuning/train_enh_dccrn.yaml b/egs2/dns_ins20/enh1/conf/tuning/train_enh_dccrn.yaml
@@ -0,0 +1,53 @@
+optim: adam
+init: null   # do not set init method here because DCCRN has its own initialization
+max_epoch: 100
+batch_type: folded
+batch_size: 32
+iterator_type: chunk
+chunk_length: 64000
+num_workers: 4
+optim_conf:
+    lr: 1.0e-03
+    eps: 1.0e-08
+    weight_decay: 1.0e-7
+patience: 10
+val_scheduler_criterion:
+- valid
+- loss
+best_model_criterion:
+-   - valid
+    - si_snr
+    - max
+-   - valid
+    - loss
+    - min
+keep_nbest_models: 1
+scheduler: reducelronplateau
+scheduler_conf:
+    mode: min
+    factor: 0.7
+    patience: 1
+model_conf:
+    loss_type: si_snr
+encoder: stft
+encoder_conf:
+    n_fft: 512
+    win_length: 400
+    hop_length: 100
+decoder: stft
+decoder_conf:
+    n_fft: 512
+    win_length: 400
+    hop_length: 100
+separator: dccrn
+
+criterions: 
+  # The first criterion
+  - name: si_snr 
+    conf:
+      eps: 1.0e-7
+    # the wrapper for the current criterion
+    # for the single-talker case, we simply use the fixed_order wrapper
+    wrapper: fixed_order
+    wrapper_conf:
+      weight: 1.0
diff --git a/egs2/iwslt14/mt1/README.md b/egs2/iwslt14/mt1/README.md
@@ -0,0 +1,14 @@
+# Results
+
+## mt_train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3_raw_bpe_tc10000
+- mt_config: conf/tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml
+- inference_config: conf/decode_mt.yaml
+
+### BLEU
+
+Metric: BLEU-4, detokenized case-sensitive BLEU result (single-reference)
+
+|dataset|bleu_score|verbose_score|
+|---|---|---|
+|beam5_maxlenratio1.6_penalty0.2/valid|33.3|68.4/42.9/28.9/19.8 (BP = 0.924 ratio = 0.927 hyp_len = 134328 ref_len = 144976)|
+|beam5_maxlenratio1.6_penalty0.2/test|32.2|67.2/41.4/27.4/18.5 (BP = 0.933 ratio = 0.935 hyp_len = 119813 ref_len = 128122)|
diff --git a/egs2/iwslt14/mt1/conf/decode_mt.yaml b/egs2/iwslt14/mt1/conf/decode_mt.yaml
@@ -1,5 +1,5 @@
-batch_size: 1
-beam_size: 10
-nbest: 1
+beam_size: 5
 lm_weight: 0.0
-
+maxlenratio: 1.6
+minlenratio: 0.0
+penalty: 0.2
diff --git a/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml b/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml
diff --git a/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml b/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml
@@ -0,0 +1 @@
+tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml
diff --git a/...mt1/conf/tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml b/...mt1/conf/tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml
@@ -0,0 +1,59 @@
+frontend: embed     # embedding + positional encoding
+frontend_conf:
+    embed_dim: 512
+    positional_dropout_rate: 0.3
+
+encoder: transformer
+encoder_conf:
+    output_size: 512
+    attention_heads: 4
+    linear_units: 1024
+    num_blocks: 6
+    dropout_rate: 0.3
+    positional_dropout_rate: 0.3
+    attention_dropout_rate: 0.3
+    input_layer: null
+    normalize_before: true
+
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 1024
+    num_blocks: 6
+    dropout_rate: 0.3
+    positional_dropout_rate: 0.3
+    self_attention_dropout_rate: 0.3
+    src_attention_dropout_rate: 0.3
+
+model_conf:
+    lsm_weight: 0.1
+    length_normalized_loss: false
+    share_decoder_input_output_embed: false
+    share_encoder_decoder_input_embed: true
+
+num_att_plot: 1
+log_interval: 100
+num_workers: 2
+batch_type: numel
+batch_bins: 400000000
+accum_grad: 1
+max_epoch: 200
+patience: none
+init: none
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+    lr: 0.003
+    betas:
+    - 0.9
+    - 0.98
+    eps: 0.000000001
+    weight_decay: 0.0001
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 10000
diff --git a/egs2/iwslt14/mt1/run.sh b/egs2/iwslt14/mt1/run.sh
@@ -10,13 +10,13 @@ tgt_lang=en
 
 train_set=train
 train_dev=valid
-test_set="test"
+test_set="test valid"
 
 mt_config=conf/train_mt_transformer.yaml
 inference_config=conf/decode_mt.yaml
 
 src_nbpe=1000
-tgt_nbpe=1000
+tgt_nbpe=10000   # if token_joint is True, then only tgt_nbpe is used
 
 # tc: truecase
 # lc: lowercase
@@ -27,12 +27,11 @@ tgt_case=tc
 
 ./mt.sh \
     --ignore_init_mismatch true \
-    --stage 1 \
-    --stop_stage 13 \
     --use_lm false \
-    --token_joint false \
-    --nj 20 \
-    --inference_nj 20 \
+    --token_joint true \
+    --ngpu 1 \
+    --nj 16 \
+    --inference_nj 32 \
     --src_lang ${src_lang} \
     --tgt_lang ${tgt_lang} \
     --src_token_type "bpe" \
@@ -49,4 +48,4 @@ tgt_case=tc
     --test_sets "${test_set}" \
     --src_bpe_train_text "data/${train_set}/text.${src_case}.${src_lang}" \
     --tgt_bpe_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" \
-    --lm_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}"  "$@"
+    --lm_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" "$@"
diff --git a/egs2/swbd_sentiment/asr1/README.md b/egs2/swbd_sentiment/asr1/README.md
@@ -0,0 +1,35 @@
+# RESULTS
+## Dataset
+- Speech Sentiment Annotations (Switchboard Sentiment)
+   - Data: https://catalog.ldc.upenn.edu/LDC2020T14
+   - Paper: https://catalog.ldc.upenn.edu/docs/LDC2020T14/LREC_2020_Switchboard_Senti.pdf
+
+## Environments
+- date: `Thu Mar  3 21:34:18 EST 2022`
+- python version: `3.7.11 (default, Jul 27 2021, 14:32:16)  [GCC 7.5.0]`
+- espnet version: `espnet 0.10.7a1`
+- pytorch version: `pytorch 1.9.0+cu102`
+- Git hash: `3b53aedc654fd30a828689c2139a1e130adac077`
+  - Commit date: `Fri Feb 25 00:13:16 2022 -0500`
+
+## Using Conformer based encoder and Transformer based decoder with spectral augmentation and predicting transcript along with sentiment
+- ASR config: [conf/tuning/train_asr_conformer.yaml](conf/tuning/train_asr_conformer.yaml)
+- token_type: word
+- labels: Positive, Neutral, Negative
+- Pre-trained Model: https://huggingface.co/espnet/YushiUeda_swbd_sentiment_asr_train_asr_conformer
+
+|dataset|Snt|Intent Classification Macro F1 (%)| Weighted F1 (%)| Micro F1 (%)|
+|---|---|---|---|---|
+|decode_asr_asr_model_valid.acc.ave_10best/valid|2415|61.0|65.0|65.6|
+|decode_asr_asr_model_valid.acc.ave_10best/test|2438|61.4|64.4|64.6|
+
+## Using Conformer based encoder, Transformer based decoder and self-supervised learning features (Wav2vec2.0) with spectral augmentation and predicting transcript along with sentiment
+- ASR config: [conf/tuning/train_asr_conformer_wav2vec2.yaml](conf/tuning/train_asr_conformer_wav2vec2.yaml)
+- token_type: word
+- labels: Positive, Neutral, Negative
+- Pre-trained Model: https://huggingface.co/espnet/YushiUeda_swbd_sentiment_asr_train_asr_conformer_wav2vec2
+
+|dataset|Snt|Intent Classification Macro F1 (%)| Weighted F1 (%)| Micro F1 (%)|
+|---|---|---|---|---|
+|decode_asr_asr_model_valid.acc.ave_10best/valid|2415|64.5|67.5|67.4|
+|decode_asr_asr_model_valid.acc.ave_10best/test|2438|64.1|66.5|66.3|
diff --git a/egs2/swbd_sentiment/asr1/asr.sh b/egs2/swbd_sentiment/asr1/asr.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/asr.sh
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml