diff --git a/egs2/README.md b/egs2/README.md
index dcbd80bf5b9..2b9bdbbca27 100755
--- a/egs2/README.md
+++ b/egs2/README.md
@@ -8,39 +8,40 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2

 | Directory name | Corpus name | Task | Language | URL | Note |
 | --- | --- | --- | --- | --- | --- |
-| aidatatang_200zh | Aidatatang_200zh A free Chinese Mandarin speech corpus | ASR | CMN | http://www.openslr.org/resources/62 | |
-| aishell | AISHELL-ASR0009-OS1 Open Source Mandarin Speech Corpus | ASR | CMN | http://www.aishelltech.com/kysjcp | |
-| aishell3 | AISHELL3 Mandarin multi-speaker text-to-speech | TTS | CMN | https://www.openslr.org/93/ | |
-| ami | The AMI Meeting Corpus | ASR | ENG | http://groups.inf.ed.ac.uk/ami/corpus/ | |
-| an4 | CMU AN4 database | ASR/TTS | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | |
-| babel | IARPA Babel corups | ASR | ~20 languages | https://www.iarpa.gov/index.php/research-programs/babel | |
-| bn_openslr53 | Large bengali ASR training dataset | ASR | BEN | https://openslr.org/53/ | |
+| aidatatang_200zh | Aidatatang_200zh A free Chinese Mandarin speech corpus | ASR | CMN | http://www.openslr.org/resources/62 | |
+| aishell | AISHELL-ASR0009-OS1 Open Source Mandarin Speech Corpus | ASR | CMN | http://www.aishelltech.com/kysjcp | |
+| aishell3 | AISHELL3 Mandarin multi-speaker text-to-speech | TTS | CMN | https://www.openslr.org/93/ | |
+| ami | The AMI Meeting Corpus | ASR | ENG | http://groups.inf.ed.ac.uk/ami/corpus/ | |
+| an4 | CMU AN4 database | ASR/TTS | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | |
+| babel | IARPA Babel corpus | ASR | ~20 languages | https://www.iarpa.gov/index.php/research-programs/babel | |
+| bn_openslr53 | Large bengali ASR training dataset | ASR | BEN | https://openslr.org/53/ | |
 | catslu | CATSLU-MAPS | SLU | CMN | https://sites.google.com/view/catslu/home | |
-| chime4 | The 4th CHiME Speech Separation and Recognition Challenge | ASR/Multichannel ASR | ENG | http://spandh.dcs.shef.ac.uk/chime_challenge/chime2016/ | |
-| cmu_indic | CMU INDIC | TTS | 7 languages | http://festvox.org/cmu_indic/ | |
+| chime4 | The 4th CHiME Speech Separation and Recognition Challenge | ASR/Multichannel ASR | ENG | http://spandh.dcs.shef.ac.uk/chime_challenge/chime2016/ | |
+| cmu_indic | CMU INDIC | TTS | 7 languages | http://festvox.org/cmu_indic/ | |
 | commonvoice | The Mozilla Common Voice | ASR | 13 languages | https://voice.mozilla.org/datasets | |
-| csj | Corpus of Spontaneous Japanese | ASR | JPN | https://pj.ninjal.ac.jp/corpus_center/csj/en/ | |
-| csmsc | Chinese Standard Mandarin Speech Copus | TTS | CMN | https://www.data-baker.com/open_source.html | |
+| csj | Corpus of Spontaneous Japanese | ASR | JPN | https://pj.ninjal.ac.jp/corpus_center/csj/en/ | |
+| csmsc | Chinese Standard Mandarin Speech Corpus | TTS | CMN | https://www.data-baker.com/open_source.html | |
 | css10 | CSS10: A Collection of Single Speaker Speech Datasets for 10 Languages | TTS | 10 langauges | https://github.com/Kyubyong/css10 | |
-| dirha_wsj | Distant-speech Interaction for Robust Home Applications | Multichannel ASR | ENG | https://dirha.fbk.eu/, https://github.com/SHINE-FBK/DIRHA_English_wsj | |
-| dns_ins20 | Deep Noise Suppression Challenge – INTERSPEECH 2020 | SE | 7 languages + singing | https://www.microsoft.com/en-us/research/academic-program/deep-noise-suppression-challenge-interspeech-2020/ | |
-| fisher_callhome_spanish | Fisher and CALLHOME Spanish--English Speech Translation | ASR/ST | SPA->ENG | https://catalog.ldc.upenn.edu/LDC2014T23 | |
-| fsc | Fluent Speech Commands Dataset | SLU | ENG | https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/ | |
-| fsc_unseen | Fluent Speech Commands Dataset MASE Eval Unseen splits | SLU | ENG | https://github.com/maseEval/mase | |
-| fsc_challenge | Fluent Speech Commands Dataset MASE Eval Challenge splits | SLU | ENG | https://github.com/maseEval/mase | |
-| gigaspeech | GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio | ASR | ENG | https://github.com/SpeechColab/GigaSpeech | |
-| grabo | Grabo dataset | SLU | ENG + NLD | https://www.esat.kuleuven.be/psi/spraak/downloads/ | |
-| hkust | HKUST/MTS: A very large scale Mandarin telephone speech corpus | ASR | CMN | https://catalog.ldc.upenn.edu/LDC2005S15 | |
-| hui_acg | HUI-audio-corpus-german | TTS | DEU | https://opendata.iisys.de/datasets.html#hui-audio-corpus-german | |
+| dirha_wsj | Distant-speech Interaction for Robust Home Applications | Multichannel ASR | ENG | https://dirha.fbk.eu/, https://github.com/SHINE-FBK/DIRHA_English_wsj | |
+| dns_ins20 | Deep Noise Suppression Challenge – INTERSPEECH 2020 | SE | 7 languages + singing | https://www.microsoft.com/en-us/research/academic-program/deep-noise-suppression-challenge-interspeech-2020/ | |
+| dsing | Automatic Lyric Transcription from Karaoke Vocal Tracks (From DAMP Sing300x30x2) | ASR (ALT) | ENG singing | https://github.com/groadabike/Kaldi-Dsing-task | |
+| fisher_callhome_spanish | Fisher and CALLHOME Spanish--English Speech Translation | ASR/ST | SPA->ENG | https://catalog.ldc.upenn.edu/LDC2014T23 | |
+| fsc | Fluent Speech Commands Dataset | SLU | ENG | https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/ | |
+| fsc_unseen | Fluent Speech Commands Dataset MASE Eval Unseen splits | SLU | ENG | https://github.com/maseEval/mase | |
+| fsc_challenge | Fluent Speech Commands Dataset MASE Eval Challenge splits | SLU | ENG | https://github.com/maseEval/mase | |
+| gigaspeech | GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio | ASR | ENG | https://github.com/SpeechColab/GigaSpeech | |
+| grabo | Grabo dataset | SLU | ENG + NLD | https://www.esat.kuleuven.be/psi/spraak/downloads/ | |
+| hkust | HKUST/MTS: A very large scale Mandarin telephone speech corpus | ASR | CMN | https://catalog.ldc.upenn.edu/LDC2005S15 | |
+| hui_acg | HUI-audio-corpus-german | TTS | DEU | https://opendata.iisys.de/datasets.html#hui-audio-corpus-german | |
 | how2 | How2: A Large-scale Dataset for Multimodal Language Understanding | ASR/MT/ST | ENG->POR | https://github.com/srvk/how2-dataset | |
-| iemocap | IEMOCAP database: The Interactive Emotional Dyadic Motion Capture database | SLU | ENG | https://sail.usc.edu/iemocap/ | |
-| iwslt21_low_resource | ALFFA, IARPA Babel, Gamayun, IWSLT 2021 | ASR | SWA | http://www.openslr.org/25/ https://catalog.ldc.upenn.edu/LDC2017S05 https://gamayun.translatorswb.org/data/ https://iwslt.org/2021/low-resource | |
+| iemocap | IEMOCAP database: The Interactive Emotional Dyadic Motion Capture database | SLU | ENG | https://sail.usc.edu/iemocap/ | |
+| iwslt21_low_resource | ALFFA, IARPA Babel, Gamayun, IWSLT 2021 | ASR | SWA | http://www.openslr.org/25/ https://catalog.ldc.upenn.edu/LDC2017S05 https://gamayun.translatorswb.org/data/ https://iwslt.org/2021/low-resource | |
 | jdcinal | Japanese Dialogue Corpus of Information Navigation and Attentive Listening Annotated with Extended ISO-24617-2 Dialogue Act Tags | SLU | JPN | http://www.lrec-conf.org/proceedings/lrec2018/pdf/464.pdf http://tts.speech.cs.cmu.edu/awb/infomation_navigation_and_attentive_listening_0.2.zip | |
 | jkac | J-KAC: Japanese Kamishibai and audiobook corpus | TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/research-topics/j-kac_corpus | |
 | jmd | JMD: Japanese multi-dialect corpus for speech synthesis | TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/research-topics/jmd_corpus | |
 | jsss | JSSS: Japanese speech corpus for summarization and simplification | TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/research-topics/jsss_corpus | |
 | jsut | Japanese speech corpus of Saruwatari-lab., University of Tokyo | ASR/TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/publication/jsut | |
-| jtubespeech | Japanese YouTube Speech corpus | ASR/TTS | JPN | | |
+| jtubespeech | Japanese YouTube Speech corpus | ASR/TTS | JPN | | |
 | jv_openslr35 | Javanese | ASR | JAV | http://www.openslr.org/35 | |
 | jvs | JVS (Japanese versatile speech) corpus | TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus | |
 | ksponspeech | KsponSpeech (Korean spontaneous speech) corpus | ASR | KOR | https://aihub.or.kr/aidata/105 | |
@@ -49,26 +50,26 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
 | librimix | LibriMix: An Open-Source Dataset for Generalizable Speech Separation | SE | ENG | https://github.com/JorisCos/LibriMix | |
 | librispeech | LibriSpeech ASR corpus | ASR | ENG | http://www.openslr.org/12 | |
 | librispeech_100 | LibriSpeech ASR corpus 100h subset | ASR | ENG | http://www.openslr.org/12 | |
-| libritts | LibriTTS corpus | TTS | ENG | http://www.openslr.org/60 | |
+| libritts | LibriTTS corpus | TTS | ENG | http://www.openslr.org/60 | |
 | ljspeech | The LJ Speech Dataset | TTS | ENG | https://keithito.com/LJ-Speech-Dataset/ | |
 | lrs2 | The Oxford-BBC Lip Reading Sentences 2 (LRS2) Dataset | Lipreading/ASR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | |
 | mini_an4 | Mini version of CMU AN4 database for the integration test | ASR/TTS/SE | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | |
 | mini_librispeech | Mini version of Librispeech corpus | DIAR | ENG | https://openslr.org/31/ | |
-| mls | MLS (A large multilingual corpus derived from LibriVox audiobooks) | ASR | 8 languages | http://www.openslr.org/94/ | |
+| mls | MLS (A large multilingual corpus derived from LibriVox audiobooks) | ASR | 8 languages | http://www.openslr.org/94/ | |
 | nsc | National Speech Corpus | ASR | ENG-SG | https://www.imda.gov.sg/programme-listing/digital-services-lab/national-speech-corpus | |
-| open_li52 | Corpus combination with 52 languages(Commonvocie + voxforge) | Multilingual ASR | 52 languages | | |
+| open_li52 | Corpus combination with 52 languages (Commonvoice + voxforge) | Multilingual ASR | 52 languages | | |
 | polyphone_swiss_french | Swiss French Polyphone corpus | ASR | FRA | http://catalog.elra.info/en-us/repository/browse/ELRA-S0030_02 | |
 | primewords_chinese | Primewords Chinese Corpus Set 1 | ASR | CMN | https://www.openslr.org/47/ | |
-| puebla_nahuatl | Highland Puebla Nahuatl corpus (endangered language in central Mexico) | ASR | HPN | https://www.openslr.org/92/ | |
+| puebla_nahuatl | Highland Puebla Nahuatl corpus (endangered language in central Mexico) | ASR | HPN | https://www.openslr.org/92/ | |
 | reverb | REVERB (REverberant Voice Enhancement and Recognition Benchmark) challenge | ASR | ENG | https://reverb2014.dereverberation.com/ | |
 | ru_open_stt | Russian Open Speech To Text (STT/ASR) Dataset | ASR | RUS | https://github.com/snakers4/open_stt | |
 | ruslan | RUSLAN: Russian Spoken Language Corpus For Speech Synthesis | TTS | RUS | https://ruslan-corpus.github.io/ | |
 | snips | SNIPS: A dataset for spoken language understanding | SLU | ENG | https://github.com/sonos/spoken-language-understanding-research-datasets | |
 | seame | SEAME: a Mandarin-English Code-switching Speech Corpus in South-East Asia | ASR | ENG + CMN | https://catalog.ldc.upenn.edu/LDC2015S04 | |
-| siwis | SIWIS: Spoken Interaction with Interpretation in Switzerland | TTS | FRA | https://https://datashare.ed.ac.uk/handle/10283/2353 | |
+| siwis | SIWIS: Spoken Interaction with Interpretation in Switzerland | TTS | FRA | https://datashare.ed.ac.uk/handle/10283/2353 | |
 | slue-voxceleb | SLUE: Spoken Language Understanding Evaluation | SLU | ENG | https://github.com/asappresearch/slue-toolkit | |
 | slurp | SLURP: A Spoken Language Understanding Resource Package | SLU | ENG | https://github.com/pswietojanski/slurp | |
-| slurp_entity | SLURP: A Spoken Language Understanding Resource Package | SLU/Entity Classification | ENG | https://github.com/pswietojanski/slurp | |
+| slurp_entity | SLURP: A Spoken Language Understanding Resource Package | SLU/Entity Classifi. | ENG | https://github.com/pswietojanski/slurp | |
 | sms_wsj | SMS-WSJ: A database for in-depth analysis of multi-channel source separation algorithms | SE | ENG | https://github.com/fgnt/sms_wsj | |
 | speechcommands | Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition | SLU | ENG | https://www.tensorflow.org/datasets/catalog/speech_commands | |
 | spgispeech | SPGISpeech 5k corpus | ASR | ENG | https://datasets.kensho.com/datasets/scribe | |
@@ -79,12 +80,12 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
 | tedlium2 | TED-LIUM corpus release 2 | ASR | ENG | https://www.openslr.org/19/, http://www.lrec-conf.org/proceedings/lrec2014/pdf/1104_Paper.pdf | |
 | thchs30 | A Free Chinese Speech Corpus Released by CSLT@Tsinghua University | TTS | CMN | https://www.openslr.org/18/ | |
 | timit | TIMIT Acoustic-Phonetic Continuous Speech Corpus | ASR | ENG | https://catalog.ldc.upenn.edu/LDC93S1 | |
-| totonac | Highland Totonac corpus (endangered language in central Mexico) | ASR | TOS | http://www.openslr.org/107/ | |
-| tsukuyomi | つくよみちゃんコーパス | TTS | JPN | https://tyc.rei-yumesaki.net/material/corpus | |
+| totonac | Highland Totonac corpus (endangered language in central Mexico) | ASR | TOS | http://www.openslr.org/107/ | |
+| tsukuyomi | つくよみちゃんコーパス | TTS | JPN | https://tyc.rei-yumesaki.net/material/corpus | |
 | vctk | English Multi-speaker Corpus for CSTR Voice Cloning Toolkit | TTS | ENG | http://www.udialogue.org/download/cstr-vctk-corpus.html | |
 | vctk_noisyreverb | Noisy reverberant speech database (48kHz) | SE | ENG | https://datashare.ed.ac.uk/handle/10283/2826 | |
 | vivos | VIVOS (Vietnamese corpus for ASR) | ASR | VIE | https://ailab.hcmus.edu.vn/vivos/ | |
-| voxforge | VoxForge | ASR | 7 languages | http://www.voxforge.org/ | |
+| voxforge | VoxForge | ASR | 7 languages | http://www.voxforge.org/ | |
 | wenetspeech | WenetSpeech: A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition | ASR | CMN | https://wenet-e2e.github.io/WenetSpeech/ | |
 | wham | The WSJ0 Hipster Ambient Mixtures (WHAM!) dataset | SE | ENG | https://wham.whisper.ai/ | |
 | whamr | WHAMR!: Noisy and Reverberant Single-Channel Speech Separation | SE | ENG | https://wham.whisper.ai/ | |
@@ -94,3 +95,4 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
 | yesno | The "yesno" corpus | ASR | HEB | http://www.openslr.org/1 | |
 | yoloxochitl_mixtec | Yoloxochitl-Mixtec corpus (endangered language in central Mexico) | ASR | XTY | http://www.openslr.org/89 | |
 | zeroth_korean | Zeroth-Korean | ASR | KOR | http://www.openslr.org/40 | |
+
diff --git a/egs2/TEMPLATE/asr1/db.sh b/egs2/TEMPLATE/asr1/db.sh
index 3785aef57a8..88113b1d547 100755
--- a/egs2/TEMPLATE/asr1/db.sh
+++ b/egs2/TEMPLATE/asr1/db.sh
@@ -11,6 +11,7 @@ DIRHA_ENGLISH_PHDEV=
 DIRHA_WSJ=
 DIRHA_WSJ_PROCESSED="${PWD}/data/local/dirha_wsj_processed" # Output file path
 DNS=
+DSING=downloads
 WSJ0=
 WSJ1=
 WSJCAM0=
@@ -159,6 +160,7 @@ if [[ "$(hostname)" == tir* ]]; then
     IWSLT22_DIALECT=/projects/tir5/data/speech_corpora/LDC2022E01_IWSLT22_Tunisian_Arabic_Shared_Task_Training_Data/
     PRIMEWORDS_CHINESE=/projects/tir5/data/speech_corpora/Primewords_Chinese
     FISHER_CALLHOME_SPANISH=/projects/tir5/data/speech_corpora/fisher_callhome_spanish
+    DSING=/projects/tir5/data/speech_corpora/sing_300x30x2
 fi

 # For only JHU environment
diff --git a/egs2/TEMPLATE/ssl1/pyscripts/dump_km_label.py b/egs2/TEMPLATE/ssl1/pyscripts/dump_km_label.py
index 6b14ac4ec97..552c84f89ad 100644
--- a/egs2/TEMPLATE/ssl1/pyscripts/dump_km_label.py
+++ b/egs2/TEMPLATE/ssl1/pyscripts/dump_km_label.py
@@ -39,13 +39,13 @@ class ApplyKmeans(object):
     def __init__(self, km_path):
         self.km_model = joblib.load(km_path)
         self.nc = self.km_model.cluster_centers_.transpose()
-        self.nc_norm = (self.nc ** 2).sum(0, keepdims=True)
+        self.nc_norm = (self.nc**2).sum(0, keepdims=True)

     def __call__(self, x):
         if isinstance(x, torch.Tensor):
             x = x.cpu().numpy()
         probs = (
-            (x ** 2).sum(1, keepdims=True) - 2 * np.matmul(x, self.nc) + self.nc_norm
+            (x**2).sum(1, keepdims=True) - 2 * np.matmul(x, self.nc) + self.nc_norm
         )
         return np.argmin(probs, axis=1)
diff --git a/egs2/dsing/asr1/asr.sh b/egs2/dsing/asr1/asr.sh
new file mode 120000
index 00000000000..60b05122cfd
--- /dev/null
+++ b/egs2/dsing/asr1/asr.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/asr.sh
\ No newline at end of file
diff --git a/egs2/dsing/asr1/cmd.sh b/egs2/dsing/asr1/cmd.sh
new file mode 100644
index 00000000000..2aae6919fef
--- /dev/null
+++ b/egs2/dsing/asr1/cmd.sh
@@ -0,0 +1,110 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#  run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#    --time