Merge pull request espnet#4100 from YushiUeda/iemocap
Add IEMOCAP results and configs
sw005320 authored Mar 2, 2022
2 parents 2830b73 + cf73065 commit 9863980
Showing 3 changed files with 177 additions and 0 deletions.
32 changes: 32 additions & 0 deletions egs2/iemocap/asr1/README.md
@@ -33,3 +33,35 @@
|decode_asr_asr_model_valid.acc.ave_10best/test|941|11017|75.7|15.1|9.2|5.6|29.9|76.1|
|decode_asr_asr_model_valid.acc.ave_10best/valid|390|4355|82.8|9.4|7.9|3.3|20.5|58.5|

# Sentiment Analysis RESULTS
## Environments
- date: `Thu Feb 17 11:25:22 EST 2022`
- python version: `3.7.11 (default, Jul 27 2021, 14:32:16) [GCC 7.5.0]`
- espnet version: `espnet 0.10.7a1`
- pytorch version: `pytorch 1.9.0+cu102`
- Git hash: `f6cde1c419c814a14ccd40abe557a780508cbcdf`
- Commit date: `Fri Feb 11 12:25:33 2022 -0500`

## Conformer encoder and Transformer decoder with SpecAugment, jointly predicting the transcript and sentiment
- ASR config: [conf/tuning/train_asr_conformer.yaml](conf/tuning/train_asr_conformer.yaml)
- token_type: word
- Sentiment Labels: Positive, Neutral, Negative
- Pretrained Model
- Hugging Face: https://huggingface.co/espnet/YushiUeda_iemocap_sentiment_asr_train_asr_conformer

|dataset|Snt|Sentiment Classification Macro F1 (%)|Weighted F1 (%)|Micro F1 (%)|
|---|---|---|---|---|
|decode_asr_model_valid.acc.ave_10best/valid|754|53.9|65.7|66.4|
|decode_asr_model_valid.acc.ave_10best/test|1650|50.3|54.5|55.7|

## Conformer encoder and Transformer decoder with self-supervised HuBERT features and SpecAugment, jointly predicting the transcript and sentiment
- ASR config: [conf/tuning/train_asr_conformer_hubert.yaml](conf/tuning/train_asr_conformer_hubert.yaml)
- token_type: word
- Sentiment Labels: Positive, Neutral, Negative
- Pretrained Model
- Hugging Face: https://huggingface.co/espnet/YushiUeda_iemocap_sentiment_asr_train_asr_conformer_hubert

|dataset|Snt|Sentiment Classification Macro F1 (%)|Weighted F1 (%)|Micro F1 (%)|
|---|---|---|---|---|
|decode_asr_model_valid.acc.ave_10best/valid|754|66.5|76.4|75.7|
|decode_asr_model_valid.acc.ave_10best/test|1650|62.0|65.5|65.8|
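The three F1 columns in the tables above differ only in how per-label scores are averaged: macro averages the three label F1s equally, weighted averages them by label support, and micro pools all decisions, which for single-label classification reduces to accuracy. A minimal sketch of the three averaging modes in plain Python (illustrative labels only, not the actual decode outputs):

```python
from collections import Counter

LABELS = ["Positive", "Neutral", "Negative"]

def f1_scores(refs, hyps):
    """Return (macro, weighted, micro) F1 over the sentiment labels."""
    support = Counter(refs)
    per_label = {}
    for lab in LABELS:
        tp = sum(1 for r, h in zip(refs, hyps) if r == lab and h == lab)
        fp = sum(1 for r, h in zip(refs, hyps) if r != lab and h == lab)
        fn = sum(1 for r, h in zip(refs, hyps) if r == lab and h != lab)
        denom = 2 * tp + fp + fn
        per_label[lab] = 2 * tp / denom if denom else 0.0
    macro = sum(per_label.values()) / len(LABELS)
    weighted = sum(per_label[lab] * support[lab] for lab in LABELS) / len(refs)
    # With exactly one label per utterance, micro F1 equals accuracy.
    micro = sum(1 for r, h in zip(refs, hyps) if r == h) / len(refs)
    return macro, weighted, micro
```

Running it on four toy utterances with one substitution error shows how the three averages diverge when label supports differ.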
60 changes: 60 additions & 0 deletions egs2/iemocap/asr1/conf/tuning/train_asr_conformer.yaml
@@ -0,0 +1,60 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 4
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    pos_enc_layer_type: "rel_pos"
    selfattention_layer_type: "rel_selfattn"
    activation_type: "swish"
    use_cnn_module: true
    cnn_module_kernel: 31

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 5000
max_epoch: 200
batch_size: 64

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2

best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10
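The `warmuplr` scheduler above ramps the learning rate up linearly for `warmup_steps` steps and then decays it proportionally to `step**-0.5`, peaking at the configured `lr`. A sketch of the rule as I read ESPnet's `WarmupLR` (stated as an assumption, not quoted from the source):

```python
def warmup_lr(base_lr: float, warmup_steps: int, step: int) -> float:
    """Noam-style schedule: linear warmup, then inverse-square-root decay.

    Peaks at exactly base_lr when step == warmup_steps.
    """
    return base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)
```

With this config (`lr: 0.0005`, `warmup_steps: 5000`), the rate climbs to 5e-4 at step 5000 and decays from there; the HuBERT config below uses a lower peak (2e-4) with a much longer warmup (25000 steps).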
85 changes: 85 additions & 0 deletions egs2/iemocap/asr1/conf/tuning/train_asr_conformer_hubert.yaml
@@ -0,0 +1,85 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    pos_enc_layer_type: "rel_pos"
    selfattention_layer_type: "rel_selfattn"
    activation_type: "swish"
    use_cnn_module: true
    cnn_module_kernel: 31

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

optim: adam
optim_conf:
    lr: 0.0002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
max_epoch: 50

freeze_param: [
    "frontend.upstream"
]

frontend: s3prl
frontend_conf:
    frontend_conf:
        upstream: hubert_large_ll60k # Note: if the upstream is changed, change input_size in the preencoder accordingly.
    download_dir: ./hub
    multilayer_feature: True

preencoder: linear
preencoder_conf:
    input_size: 1024 # Note: if the upstream is changed, change this value accordingly.
    output_size: 80

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
    extract_feats_in_collect_stats: false # Note: false generates dummy stats files during collect stats (stage 10) instead of running the frontend to extract features.

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2

best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10
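Both configs share the SpecAugment settings above: up to 2 frequency masks of width 0-30 bins, up to 2 time masks of width 0-40 frames, and time warping with a window of 5. A minimal sketch of the masking part (time warp omitted) on a list-of-lists spectrogram, written from the config values rather than from ESPnet's actual implementation:

```python
import random

def apply_specaug_masks(spec, num_freq_mask=2, freq_width=(0, 30),
                        num_time_mask=2, time_width=(0, 40), rng=None):
    """Zero out random frequency bands and time spans of spec[time][freq] in place."""
    rng = rng or random.Random()
    n_time, n_freq = len(spec), len(spec[0])
    for _ in range(num_freq_mask):
        w = rng.randint(freq_width[0], min(freq_width[1], n_freq))
        f0 = rng.randint(0, n_freq - w)
        for t in range(n_time):
            for f in range(f0, f0 + w):
                spec[t][f] = 0.0
    for _ in range(num_time_mask):
        w = rng.randint(time_width[0], min(time_width[1], n_time))
        t0 = rng.randint(0, n_time - w)
        for t in range(t0, t0 + w):
            spec[t] = [0.0] * n_freq
    return spec
```

The function mutates its input in place and preserves the spectrogram's shape; widths are drawn uniformly from the configured ranges, so a mask of width 0 is a no-op.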
