diff --git a/README.md b/README.md
index b02a8b9212d..7bae1d28434 100644
--- a/README.md
+++ b/README.md
@@ -35,7 +35,7 @@ ESPnet uses [pytorch](http://pytorch.org/) as a deep learning engine and also fo
 - Support numbers of `ASR` recipes (WSJ, Switchboard, CHiME-4/5, Librispeech, TED, CSJ, AMI, HKUST, Voxforge, REVERB, etc.)
 - Support numbers of `TTS` recipes with a similar manner to the ASR recipe (LJSpeech, LibriTTS, M-AILABS, etc.)
 - Support numbers of `ST` recipes (Fisher-CallHome Spanish, Libri-trans, IWSLT'18, How2, Must-C, Mboshi-French, etc.)
-- Support numbers of `MT` recipes (IWSLT'16, the above ST recipes etc.)
+- Support numbers of `MT` recipes (IWSLT'14, IWSLT'16, the above ST recipes etc.)
 - Support numbers of `SLU` recipes (CATSLU-MAPS, FSC, Grabo, IEMOCAP, JDCINAL, SNIPS, SLURP, SWBD-DA, etc.)
 - Support numbers of `SE/SS` recipes (DNS-IS2020, LibriMix, SMS-WSJ, VCTK-noisyreverb, WHAM!, WHAMR!, WSJ-2mix, etc.)
 - Support voice conversion recipe (VCC2020 baseline)
@@ -368,6 +368,7 @@ Available pretrained models in the demo script are listed as below.
 | Must-C tst-COMMON (En->De) | 27.63 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/mt1/RESULTS.md#summary-4-gram-bleu) |
 | IWSLT'14 test2014 (En->De) | 24.70 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) |
 | IWSLT'14 test2014 (De->En) | 29.22 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) |
+| IWSLT'14 test2014 (De->En) | 32.2 | [link](https://github.com/espnet/espnet/blob/master/egs2/iwslt14/mt1/README.md) |
 | IWSLT'16 test2014 (En->De) | 24.05 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) |
 | IWSLT'16 test2014 (De->En) | 29.13 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) |
 
diff --git a/egs2/README.md b/egs2/README.md
index d03884f2ceb..dcbd80bf5b9 100755
--- a/egs2/README.md
+++ b/egs2/README.md
@@ -75,6 +75,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
 | su_openslr36 | Sundanese | ASR | SUN | http://www.openslr.org/36 | |
 | swbd | Switchboard Corpus for 2-channel Conversational Telephone Speech (300h) | ASR | ENG | https://catalog.ldc.upenn.edu/LDC97S62 | |
 | swbd_da | NXT Switchboard Annotations | SLU | ENG | https://catalog.ldc.upenn.edu/LDC2009T26 | |
+| swbd_sentiment | Speech Sentiment Annotations | SLU | ENG | https://catalog.ldc.upenn.edu/LDC2020T14 | |
 | tedlium2 | TED-LIUM corpus release 2 | ASR | ENG | https://www.openslr.org/19/, http://www.lrec-conf.org/proceedings/lrec2014/pdf/1104_Paper.pdf | |
 | thchs30 | A Free Chinese Speech Corpus Released by CSLT@Tsinghua University | TTS | CMN | https://www.openslr.org/18/ | |
 | timit | TIMIT Acoustic-Phonetic Continuous Speech Corpus | ASR | ENG | https://catalog.ldc.upenn.edu/LDC93S1 | |
diff --git a/egs2/TEMPLATE/mt1/mt.sh b/egs2/TEMPLATE/mt1/mt.sh
index cbfaf059d41..6164c155558 100755
--- a/egs2/TEMPLATE/mt1/mt.sh
+++ b/egs2/TEMPLATE/mt1/mt.sh
@@ -460,7 +460,7 @@ if ! "${skip_data_prep}"; then
 
     if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         if [ "${feats_type}" = raw ]; then
-            log "Stage 3: data/ -> ${data_feats}"
+            log "Stage 2: data/ -> ${data_feats}"
 
             for dset in "${train_set}" "${valid_set}" ${test_sets}; do
                 if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then
@@ -508,19 +508,18 @@ if ! "${skip_data_prep}"; then
"${skip_data_prep}"; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Then generate src lang if "${token_joint}"; then log "Merge src and target data if joint BPE" cat $tgt_bpe_train_text > ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang} - [ -z "${src_bpe_train_text}" ] && cat ${src_bpe_train_text} >> ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang} + [ ! -z "${src_bpe_train_text}" ] && cat ${src_bpe_train_text} >> ${data_feats}/${train_set}/text.${src_lang}_${tgt_lang} # Set the new text as the target text tgt_bpe_train_text="${data_feats}/${train_set}/text.${src_lang}_${tgt_lang}" fi # First generate tgt lang if [ "${tgt_token_type}" = bpe ]; then - log "Stage 5a: Generate token_list from ${tgt_bpe_train_text} using BPE for tgt_lang" + log "Stage 4a: Generate token_list from ${tgt_bpe_train_text} using BPE for tgt_lang" mkdir -p "${tgt_bpedir}" # shellcheck disable=SC2002 @@ -550,7 +549,7 @@ if ! "${skip_data_prep}"; then } > "${tgt_token_list}" elif [ "${tgt_token_type}" = char ] || [ "${tgt_token_type}" = word ]; then - log "Stage 5a: Generate character level token_list from ${tgt_bpe_train_text} for tgt_lang" + log "Stage 4a: Generate character level token_list from ${tgt_bpe_train_text} for tgt_lang" _opts="--non_linguistic_symbols ${nlsyms_txt}" @@ -593,10 +592,10 @@ if ! "${skip_data_prep}"; then # Then generate src lang if "${token_joint}"; then - log "Stage 5b: Skip separate token construction for src_lang when setting ${token_joint} as true" + log "Stage 4b: Skip separate token construction for src_lang when setting ${token_joint} as true" else if [ "${src_token_type}" = bpe ]; then - log "Stage 5b: Generate token_list from ${src_bpe_train_text} using BPE for src_lang" + log "Stage 4b: Generate token_list from ${src_bpe_train_text} using BPE for src_lang" mkdir -p "${src_bpedir}" # shellcheck disable=SC2002 @@ -626,7 +625,7 @@ if ! "${skip_data_prep}"; then } > "${src_token_list}" elif [ "${src_token_type}" = char ] || [ "${src_token_type}" = word ]; then - log "Stage 5b: Generate character level token_list from ${src_bpe_train_text} for src_lang" + log "Stage 4b: Generate character level token_list from ${src_bpe_train_text} for src_lang" _opts="--non_linguistic_symbols ${nlsyms_txt}" @@ -650,8 +649,6 @@ if ! 
"${skip_data_prep}"; then log "Error: not supported --token_type '${src_token_type}'" exit 2 fi - - fi fi diff --git a/egs2/iwslt14/mt1/README.md b/egs2/iwslt14/mt1/README.md new file mode 100644 index 00000000000..de18c222268 --- /dev/null +++ b/egs2/iwslt14/mt1/README.md @@ -0,0 +1,14 @@ +# Results + +## mt_train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3_raw_bpe_tc10000 +- mt_config: conf/tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml +- inference_config: conf/decode_mt.yaml + +### BLEU + +Metric: BLEU-4, detokenized case-sensitive BLEU result (single-reference) + +|dataset|bleu_score|verbose_score| +|---|---|---| +|beam5_maxlenratio1.6_penalty0.2/valid|33.3|68.4/42.9/28.9/19.8 (BP = 0.924 ratio = 0.927 hyp_len = 134328 ref_len = 144976)| +|beam5_maxlenratio1.6_penalty0.2/test|32.2|67.2/41.4/27.4/18.5 (BP = 0.933 ratio = 0.935 hyp_len = 119813 ref_len = 128122)| diff --git a/egs2/iwslt14/mt1/conf/decode_mt.yaml b/egs2/iwslt14/mt1/conf/decode_mt.yaml index 2967ee6fc0f..6570a89920d 100644 --- a/egs2/iwslt14/mt1/conf/decode_mt.yaml +++ b/egs2/iwslt14/mt1/conf/decode_mt.yaml @@ -1,5 +1,5 @@ -batch_size: 1 -beam_size: 10 -nbest: 1 +beam_size: 5 lm_weight: 0.0 - +maxlenratio: 1.6 +minlenratio: 0.0 +penalty: 0.2 diff --git a/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml b/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml deleted file mode 100644 index f39b863dc70..00000000000 --- a/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml +++ /dev/null @@ -1,44 +0,0 @@ -batch_type: numel -batch_bins: 15000000 -accum_grad: 8 -max_epoch: 35 -patience: none -init: none -best_model_criterion: -- - valid - - acc - - max -keep_nbest_models: 10 - -encoder: transformer -encoder_conf: - output_size: 256 - attention_heads: 4 - linear_units: 2048 - num_blocks: 12 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.1 - input_layer: null - normalize_before: true - -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -model_conf: - lsm_weight: 0.1 - length_normalized_loss: false - -optim: adam -optim_conf: - lr: 0.0005 -scheduler: warmuplr -scheduler_conf: - warmup_steps: 25000 diff --git a/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml b/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml new file mode 120000 index 00000000000..050cda0e4d0 --- /dev/null +++ b/egs2/iwslt14/mt1/conf/train_mt_transformer.yaml @@ -0,0 +1 @@ +tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml \ No newline at end of file diff --git a/egs2/iwslt14/mt1/conf/tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml b/egs2/iwslt14/mt1/conf/tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml new file mode 100644 index 00000000000..8b2d8844238 --- /dev/null +++ b/egs2/iwslt14/mt1/conf/tuning/train_mt_transformer_lr3e-3_warmup10k_share_enc_dec_input_dropout0.3.yaml @@ -0,0 +1,59 @@ +frontend: embed # embedding + positional encoding +frontend_conf: + embed_dim: 512 + positional_dropout_rate: 0.3 + +encoder: transformer +encoder_conf: + output_size: 512 + attention_heads: 4 + linear_units: 1024 + num_blocks: 6 + dropout_rate: 0.3 + positional_dropout_rate: 0.3 + attention_dropout_rate: 0.3 + input_layer: null + normalize_before: true + +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 1024 + num_blocks: 6 
+    dropout_rate: 0.3
+    positional_dropout_rate: 0.3
+    self_attention_dropout_rate: 0.3
+    src_attention_dropout_rate: 0.3
+
+model_conf:
+    lsm_weight: 0.1
+    length_normalized_loss: false
+    share_decoder_input_output_embed: false
+    share_encoder_decoder_input_embed: true
+
+num_att_plot: 1
+log_interval: 100
+num_workers: 2
+batch_type: numel
+batch_bins: 400000000
+accum_grad: 1
+max_epoch: 200
+patience: none
+init: none
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+    lr: 0.003
+    betas:
+        - 0.9
+        - 0.98
+    eps: 0.000000001
+    weight_decay: 0.0001
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 10000
diff --git a/egs2/iwslt14/mt1/run.sh b/egs2/iwslt14/mt1/run.sh
index 06c09e91751..b8567d2709a 100755
--- a/egs2/iwslt14/mt1/run.sh
+++ b/egs2/iwslt14/mt1/run.sh
@@ -10,13 +10,13 @@ tgt_lang=en
 
 train_set=train
 train_dev=valid
-test_set="test"
+test_set="test valid"
 
 mt_config=conf/train_mt_transformer.yaml
 inference_config=conf/decode_mt.yaml
 
 src_nbpe=1000
-tgt_nbpe=1000
+tgt_nbpe=10000 # if token_joint is True, then only tgt_nbpe is used
 
 # tc: truecase
 # lc: lowercase
@@ -27,12 +27,11 @@ tgt_case=tc
 
 ./mt.sh \
     --ignore_init_mismatch true \
-    --stage 1 \
-    --stop_stage 13 \
     --use_lm false \
-    --token_joint false \
-    --nj 20 \
-    --inference_nj 20 \
+    --token_joint true \
+    --ngpu 1 \
+    --nj 16 \
+    --inference_nj 32 \
     --src_lang ${src_lang} \
    --tgt_lang ${tgt_lang} \
     --src_token_type "bpe" \
@@ -49,4 +48,4 @@ tgt_case=tc
     --test_sets "${test_set}" \
     --src_bpe_train_text "data/${train_set}/text.${src_case}.${src_lang}" \
     --tgt_bpe_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" \
-    --lm_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" "$@"
+    --lm_train_text "data/${train_set}/text.${tgt_case}.${tgt_lang}" "$@"
diff --git a/egs2/swbd_sentiment/asr1/README.md b/egs2/swbd_sentiment/asr1/README.md
new file mode 100644
index 00000000000..84ee7efbbf1
--- /dev/null
+++ b/egs2/swbd_sentiment/asr1/README.md
@@ -0,0 +1,35 @@
+# RESULTS
+## Dataset
+- Speech Sentiment Annotations (Switchboard Sentiment)
+  - Data: https://catalog.ldc.upenn.edu/LDC2020T14
+  - Paper: https://catalog.ldc.upenn.edu/docs/LDC2020T14/LREC_2020_Switchboard_Senti.pdf
+
+## Environments
+- date: `Thu Mar 3 21:34:18 EST 2022`
+- python version: `3.7.11 (default, Jul 27 2021, 14:32:16) [GCC 7.5.0]`
+- espnet version: `espnet 0.10.7a1`
+- pytorch version: `pytorch 1.9.0+cu102`
+- Git hash: `3b53aedc654fd30a828689c2139a1e130adac077`
+  - Commit date: `Fri Feb 25 00:13:16 2022 -0500`
+
+## Using Conformer based encoder and Transformer based decoder with spectral augmentation and predicting transcript along with sentiment
+- ASR config: [conf/tuning/train_asr_conformer.yaml](conf/tuning/train_asr_conformer.yaml)
+- token_type: word
+- labels: Positive, Neutral, Negative
+- Pre-trained Model: https://huggingface.co/espnet/YushiUeda_swbd_sentiment_asr_train_asr_conformer
+
+|dataset|Snt|Intent Classification Macro F1 (%)| Weighted F1 (%)| Micro F1 (%)|
+|---|---|---|---|---|
+|decode_asr_asr_model_valid.acc.ave_10best/valid|2415|61.0|65.0|65.6|
+|decode_asr_asr_model_valid.acc.ave_10best/test|2438|61.4|64.4|64.6|
+
+## Using Conformer based encoder, Transformer based decoder and self-supervised learning features (Wav2vec2.0) with spectral augmentation and predicting transcript along with sentiment
+- ASR config: [conf/tuning/train_asr_conformer_wav2vec2.yaml](conf/tuning/train_asr_conformer_wav2vec2.yaml)
+- token_type: word
+- labels: Positive, Neutral, Negative
+- Pre-trained Model: https://huggingface.co/espnet/YushiUeda_swbd_sentiment_asr_train_asr_conformer_wav2vec2
+
+|dataset|Snt|Intent Classification Macro F1 (%)| Weighted F1 (%)| Micro F1 (%)|
+|---|---|---|---|---|
+|decode_asr_asr_model_valid.acc.ave_10best/valid|2415|64.5|67.5|67.4|
+|decode_asr_asr_model_valid.acc.ave_10best/test|2438|64.1|66.5|66.3|
\ No newline at end of file
diff --git a/egs2/swbd_sentiment/asr1/asr.sh b/egs2/swbd_sentiment/asr1/asr.sh
new file mode 120000
index 00000000000..60b05122cfd
--- /dev/null
+++ b/egs2/swbd_sentiment/asr1/asr.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/asr.sh
\ No newline at end of file
diff --git a/egs2/swbd_sentiment/asr1/cmd.sh b/egs2/swbd_sentiment/asr1/cmd.sh
new file mode 100644
index 00000000000..2aae6919fef
--- /dev/null
+++ b/egs2/swbd_sentiment/asr1/cmd.sh
@@ -0,0 +1,110 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+# --time