
Commit

Merge branch 'espnet:master' into master
roshansh-cmu authored Mar 2, 2022
2 parents bd8e400 + 9863980 commit 5c5eb02
Showing 8 changed files with 270 additions and 84 deletions.
154 changes: 77 additions & 77 deletions egs2/README.md

Large diffs are not rendered by default.

15 changes: 12 additions & 3 deletions egs2/TEMPLATE/tts1/tts.sh
@@ -702,8 +702,12 @@ if ! "${skip_train}"; then
 # CASE 1: AR model training         #
 #####################################
 _scp=wav.scp
-# "sound" supports "wav", "flac", etc.
-_type=sound
+if [[ "${audio_format}" == *ark* ]]; then
+    _type=kaldi_ark
+else
+    # "sound" supports "wav", "flac", etc.
+    _type=sound
+fi
 _fold_length="$((speech_fold_length * n_shift))"
 _opts+="--feats_extract ${feats_extract} "
 _opts+="--feats_extract_conf n_fft=${n_fft} "
@@ -780,7 +784,12 @@ if ! "${skip_train}"; then
 else
     # Teacher forcing case: use groundtruth as the target
     _scp=wav.scp
-    _type=sound
+    if [[ "${audio_format}" == *ark* ]]; then
+        _type=kaldi_ark
+    else
+        # "sound" supports "wav", "flac", etc.
+        _type=sound
+    fi
     _fold_length="$((speech_fold_length * n_shift))"
     _opts+="--feats_extract ${feats_extract} "
     _opts+="--feats_extract_conf n_fft=${n_fft} "
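Both hunks apply the same rule: Kaldi ark containers (e.g. an `audio_format` such as `flac.ark`) need the `kaldi_ark` loader, while plain audio files go through the `sound` loader. A minimal Python restatement of that selection (illustrative only; the actual logic is the shell above):

```python
def data_type_for(audio_format: str) -> str:
    """Mirror of the shell test `[[ "${audio_format}" == *ark* ]]`."""
    return "kaldi_ark" if "ark" in audio_format else "sound"

# "sound" supports plain files such as "wav" and "flac".
assert data_type_for("wav") == "sound"
assert data_type_for("flac.ark") == "kaldi_ark"
```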
32 changes: 32 additions & 0 deletions egs2/iemocap/asr1/README.md
@@ -33,3 +33,35 @@
|decode_asr_asr_model_valid.acc.ave_10best/test|941|11017|75.7|15.1|9.2|5.6|29.9|76.1|
|decode_asr_asr_model_valid.acc.ave_10best/valid|390|4355|82.8|9.4|7.9|3.3|20.5|58.5|

# Sentiment Analysis RESULTS
## Environments
- date: `Thu Feb 17 11:25:22 EST 2022`
- python version: `3.7.11 (default, Jul 27 2021, 14:32:16) [GCC 7.5.0]`
- espnet version: `espnet 0.10.7a1`
- pytorch version: `pytorch 1.9.0+cu102`
- Git hash: `f6cde1c419c814a14ccd40abe557a780508cbcdf`
- Commit date: `Fri Feb 11 12:25:33 2022 -0500`

## Conformer-based encoder and Transformer-based decoder with spectral augmentation, predicting the transcript along with sentiment
- ASR config: [conf/tuning/train_asr_conformer.yaml](conf/tuning/train_asr_conformer.yaml)
- token_type: word
- Sentiment Labels: Positive, Neutral, Negative
- Pretrained Model
  - Hugging Face: https://huggingface.co/espnet/YushiUeda_iemocap_sentiment_asr_train_asr_conformer

|dataset|Snt|Sentiment Classification Macro F1 (%)|Weighted F1 (%)|Micro F1 (%)|
|---|---|---|---|---|
|decode_asr_model_valid.acc.ave_10best/valid|754|53.9|65.7|66.4|
|decode_asr_model_valid.acc.ave_10best/test|1650|50.3|54.5|55.7|

## Conformer-based encoder, Transformer-based decoder, and self-supervised learning features with spectral augmentation, predicting the transcript along with sentiment
- ASR config: [conf/tuning/train_asr_conformer_hubert.yaml](conf/tuning/train_asr_conformer_hubert.yaml)
- token_type: word
- Sentiment Labels: Positive, Neutral, Negative
- Pretrained Model
  - Hugging Face: https://huggingface.co/espnet/YushiUeda_iemocap_sentiment_asr_train_asr_conformer_hubert

|dataset|Snt|Sentiment Classification Macro F1 (%)|Weighted F1 (%)|Micro F1 (%)|
|---|---|---|---|---|
|decode_asr_model_valid.acc.ave_10best/valid|754|66.5|76.4|75.7|
|decode_asr_model_valid.acc.ave_10best/test|1650|62.0|65.5|65.8|
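
For reference, the three F1 columns in these tables differ only in how per-class scores are averaged: macro averages the classes equally, weighted averages them by class support, and micro pools all decisions globally. A minimal sketch with scikit-learn (hypothetical labels, not part of this recipe):

```python
from sklearn.metrics import f1_score

# Hypothetical references and predictions over the three sentiment labels.
refs = ["Positive", "Neutral", "Negative", "Neutral", "Positive"]
hyps = ["Positive", "Negative", "Negative", "Neutral", "Neutral"]

for avg in ("macro", "weighted", "micro"):
    print(avg, round(f1_score(refs, hyps, average=avg), 3))
```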
60 changes: 60 additions & 0 deletions egs2/iemocap/asr1/conf/tuning/train_asr_conformer.yaml
@@ -0,0 +1,60 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 4
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    pos_enc_layer_type: "rel_pos"
    selfattention_layer_type: "rel_selfattn"
    activation_type: "swish"
    use_cnn_module: true
    cnn_module_kernel: 31
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 5000
max_epoch: 200
batch_size: 64

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2

best_model_criterion:
- - valid
  - acc
  - max
keep_nbest_models: 10
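
The warmuplr scheduler ramps the learning rate up for warmup_steps updates and then decays it with the inverse square root of the step. A sketch of the usual Noam-style rule, which has the shape this config relies on (an illustration; ESPnet's own scheduler class may differ in detail):

```python
def warmup_lr(step: int, base_lr: float = 0.0005, warmup_steps: int = 5000) -> float:
    # Linear ramp-up until warmup_steps, then inverse-sqrt decay;
    # the peak value base_lr is reached exactly at step == warmup_steps.
    return base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)

for step in (1, 2500, 5000, 50000):
    print(step, f"{warmup_lr(step):.2e}")
```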
85 changes: 85 additions & 0 deletions egs2/iemocap/asr1/conf/tuning/train_asr_conformer_hubert.yaml
@@ -0,0 +1,85 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    pos_enc_layer_type: "rel_pos"
    selfattention_layer_type: "rel_selfattn"
    activation_type: "swish"
    use_cnn_module: true
    cnn_module_kernel: 31

decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

optim: adam
optim_conf:
    lr: 0.0002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
max_epoch: 50

freeze_param: [
    "frontend.upstream"
]

frontend_conf:
    n_fft: 512
    hop_length: 256

frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: hubert_large_ll60k # Note: If the upstream is changed, please change the input_size in the preencoder.
    download_dir: ./hub
    multilayer_feature: True

preencoder: linear
preencoder_conf:
    input_size: 1024 # Note: If the upstream is changed, please change this value accordingly.
    output_size: 80

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
    extract_feats_in_collect_stats: false # Note: "False" means during collect stats (stage 10), generating dummy stats files rather than extract_feats by forward frontend.

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2

best_model_criterion:
- - valid
  - acc
  - max
keep_nbest_models: 10
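
Two pieces of this config are worth spelling out: freeze_param keeps the HuBERT upstream fixed during fine-tuning, and the linear preencoder projects its 1024-dimensional features down to the 80 dimensions the encoder expects. A minimal PyTorch sketch of both ideas (illustrative, not ESPnet's implementation):

```python
import torch

class LinearPreencoder(torch.nn.Module):
    """Project SSL features (e.g. 1024-dim HuBERT-large) to the encoder size."""

    def __init__(self, input_size: int = 1024, output_size: int = 80):
        super().__init__()
        self.proj = torch.nn.Linear(input_size, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, input_size) -> (batch, time, output_size)
        return self.proj(x)

def freeze_by_prefix(model: torch.nn.Module, prefix: str = "frontend.upstream") -> None:
    # Counterpart of freeze_param: exclude matching weights from gradient updates.
    for name, param in model.named_parameters():
        if name.startswith(prefix):
            param.requires_grad = False
```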
4 changes: 2 additions & 2 deletions espnet2/st/espnet_model.py
@@ -426,11 +426,11 @@ def _calc_asr_att_loss(
         )
 
         # Compute cer/wer using attention-decoder
-        if self.training or self.error_calculator is None:
+        if self.training or self.asr_error_calculator is None:
             cer_att, wer_att = None, None
         else:
             ys_hat = decoder_out.argmax(dim=-1)
-            cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())
+            cer_att, wer_att = self.asr_error_calculator(ys_hat.cpu(), ys_pad.cpu())
 
         return loss_att, acc_att, cer_att, wer_att
 
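The fix matters because the ST model keeps separate error calculators for its sub-tasks (e.g. ASR and MT), and the ASR attention branch must use the ASR-specific one. The surrounding guard follows a common pattern, sketched below with hypothetical standalone names: skip the expensive edit-distance computation during training or when no calculator is configured.

```python
def asr_att_metrics(decoder_out, ys_pad, training, asr_error_calculator):
    # Hypothetical standalone version of the guarded block above.
    if training or asr_error_calculator is None:
        return None, None
    ys_hat = decoder_out.argmax(dim=-1)  # greedy token decisions
    return asr_error_calculator(ys_hat.cpu(), ys_pad.cpu())
```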
2 changes: 1 addition & 1 deletion setup.py
@@ -71,7 +71,7 @@
         "torch_optimizer",
         "fairscale",
         "transformers",
-        "gtn",
+        "gtn==0.0.0",
     ],
     "setup": [
         "numpy<=1.21.3",
2 changes: 1 addition & 1 deletion tools/installers/install_gtn.sh
@@ -11,7 +11,7 @@ fi
 if [ ! -e gtn.done ]; then
     (
         set -euo pipefail
-        python3 -m pip install gtn
+        python3 -m pip install gtn==0.0.0
     )
     touch gtn.done
 else
