diff --git a/egs2/README.md b/egs2/README.md
index dcbd80bf5b9..2b9bdbbca27 100755
--- a/egs2/README.md
+++ b/egs2/README.md
@@ -8,39 +8,40 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2

 | Directory name | Corpus name | Task | Language | URL | Note |
 | --- | --- | --- | --- | --- | --- |
-| aidatatang_200zh | Aidatatang_200zh A free Chinese Mandarin speech corpus | ASR | CMN | http://www.openslr.org/resources/62 | |
-| aishell | AISHELL-ASR0009-OS1 Open Source Mandarin Speech Corpus | ASR | CMN | http://www.aishelltech.com/kysjcp | |
-| aishell3 | AISHELL3 Mandarin multi-speaker text-to-speech | TTS | CMN | https://www.openslr.org/93/ | |
-| ami | The AMI Meeting Corpus | ASR | ENG | http://groups.inf.ed.ac.uk/ami/corpus/ | |
-| an4 | CMU AN4 database | ASR/TTS | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | |
-| babel | IARPA Babel corups | ASR | ~20 languages | https://www.iarpa.gov/index.php/research-programs/babel | |
-| bn_openslr53 | Large bengali ASR training dataset | ASR | BEN | https://openslr.org/53/ | |
+| aidatatang_200zh | Aidatatang_200zh A free Chinese Mandarin speech corpus | ASR | CMN | http://www.openslr.org/resources/62 | |
+| aishell | AISHELL-ASR0009-OS1 Open Source Mandarin Speech Corpus | ASR | CMN | http://www.aishelltech.com/kysjcp | |
+| aishell3 | AISHELL3 Mandarin multi-speaker text-to-speech | TTS | CMN | https://www.openslr.org/93/ | |
+| ami | The AMI Meeting Corpus | ASR | ENG | http://groups.inf.ed.ac.uk/ami/corpus/ | |
+| an4 | CMU AN4 database | ASR/TTS | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | |
+| babel | IARPA Babel corpus | ASR | ~20 languages | https://www.iarpa.gov/index.php/research-programs/babel | |
+| bn_openslr53 | Large bengali ASR training dataset | ASR | BEN | https://openslr.org/53/ | |
 | catslu | CATSLU-MAPS | SLU | CMN | https://sites.google.com/view/catslu/home | |
-| chime4 | The 4th CHiME Speech Separation and Recognition Challenge | ASR/Multichannel ASR | ENG | http://spandh.dcs.shef.ac.uk/chime_challenge/chime2016/ | |
-| cmu_indic | CMU INDIC | TTS | 7 languages | http://festvox.org/cmu_indic/ | |
+| chime4 | The 4th CHiME Speech Separation and Recognition Challenge | ASR/Multichannel ASR | ENG | http://spandh.dcs.shef.ac.uk/chime_challenge/chime2016/ | |
+| cmu_indic | CMU INDIC | TTS | 7 languages | http://festvox.org/cmu_indic/ | |
 | commonvoice | The Mozilla Common Voice | ASR | 13 languages | https://voice.mozilla.org/datasets | |
-| csj | Corpus of Spontaneous Japanese | ASR | JPN | https://pj.ninjal.ac.jp/corpus_center/csj/en/ | |
-| csmsc | Chinese Standard Mandarin Speech Copus | TTS | CMN | https://www.data-baker.com/open_source.html | |
+| csj | Corpus of Spontaneous Japanese | ASR | JPN | https://pj.ninjal.ac.jp/corpus_center/csj/en/ | |
+| csmsc | Chinese Standard Mandarin Speech Corpus | TTS | CMN | https://www.data-baker.com/open_source.html | |
 | css10 | CSS10: A Collection of Single Speaker Speech Datasets for 10 Languages | TTS | 10 langauges | https://github.com/Kyubyong/css10 | |
-| dirha_wsj | Distant-speech Interaction for Robust Home Applications | Multichannel ASR | ENG | https://dirha.fbk.eu/, https://github.com/SHINE-FBK/DIRHA_English_wsj | |
-| dns_ins20 | Deep Noise Suppression Challenge – INTERSPEECH 2020 | SE | 7 languages + singing | https://www.microsoft.com/en-us/research/academic-program/deep-noise-suppression-challenge-interspeech-2020/ | |
-| fisher_callhome_spanish | Fisher and CALLHOME Spanish--English Speech Translation | ASR/ST | SPA->ENG | https://catalog.ldc.upenn.edu/LDC2014T23 | |
-| fsc | Fluent Speech Commands Dataset | SLU | ENG | https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/ | |
-| fsc_unseen | Fluent Speech Commands Dataset MASE Eval Unseen splits | SLU | ENG | https://github.com/maseEval/mase | |
-| fsc_challenge | Fluent Speech Commands Dataset MASE Eval Challenge splits | SLU | ENG | https://github.com/maseEval/mase | |
-| gigaspeech | GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio | ASR | ENG | https://github.com/SpeechColab/GigaSpeech | |
-| grabo | Grabo dataset | SLU | ENG + NLD | https://www.esat.kuleuven.be/psi/spraak/downloads/ | |
-| hkust | HKUST/MTS: A very large scale Mandarin telephone speech corpus | ASR | CMN | https://catalog.ldc.upenn.edu/LDC2005S15 | |
-| hui_acg | HUI-audio-corpus-german | TTS | DEU | https://opendata.iisys.de/datasets.html#hui-audio-corpus-german | |
+| dirha_wsj | Distant-speech Interaction for Robust Home Applications | Multichannel ASR | ENG | https://dirha.fbk.eu/, https://github.com/SHINE-FBK/DIRHA_English_wsj | |
+| dns_ins20 | Deep Noise Suppression Challenge – INTERSPEECH 2020 | SE | 7 languages + singing | https://www.microsoft.com/en-us/research/academic-program/deep-noise-suppression-challenge-interspeech-2020/ | |
+| dsing | Automatic Lyric Transcription from Karaoke Vocal Tracks (From DAMP Sing300x30x2) | ASR (ALT) | ENG singing | https://github.com/groadabike/Kaldi-Dsing-task | |
+| fisher_callhome_spanish | Fisher and CALLHOME Spanish--English Speech Translation | ASR/ST | SPA->ENG | https://catalog.ldc.upenn.edu/LDC2014T23 | |
+| fsc | Fluent Speech Commands Dataset | SLU | ENG | https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/ | |
+| fsc_unseen | Fluent Speech Commands Dataset MASE Eval Unseen splits | SLU | ENG | https://github.com/maseEval/mase | |
+| fsc_challenge | Fluent Speech Commands Dataset MASE Eval Challenge splits | SLU | ENG | https://github.com/maseEval/mase | |
+| gigaspeech | GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio | ASR | ENG | https://github.com/SpeechColab/GigaSpeech | |
+| grabo | Grabo dataset | SLU | ENG + NLD | https://www.esat.kuleuven.be/psi/spraak/downloads/ | |
+| hkust | HKUST/MTS: A very large scale Mandarin telephone speech corpus | ASR | CMN | https://catalog.ldc.upenn.edu/LDC2005S15 | |
+| hui_acg | HUI-audio-corpus-german | TTS | DEU | https://opendata.iisys.de/datasets.html#hui-audio-corpus-german | |
 | how2 | How2: A Large-scale Dataset for Multimodal Language Understanding | ASR/MT/ST | ENG->POR | https://github.com/srvk/how2-dataset | |
-| iemocap | IEMOCAP database: The Interactive Emotional Dyadic Motion Capture database | SLU | ENG | https://sail.usc.edu/iemocap/ | |
-| iwslt21_low_resource | ALFFA, IARPA Babel, Gamayun, IWSLT 2021 | ASR | SWA | http://www.openslr.org/25/ https://catalog.ldc.upenn.edu/LDC2017S05 https://gamayun.translatorswb.org/data/ https://iwslt.org/2021/low-resource | |
+| iemocap | IEMOCAP database: The Interactive Emotional Dyadic Motion Capture database | SLU | ENG | https://sail.usc.edu/iemocap/ | |
+| iwslt21_low_resource | ALFFA, IARPA Babel, Gamayun, IWSLT 2021 | ASR | SWA | http://www.openslr.org/25/ https://catalog.ldc.upenn.edu/LDC2017S05 https://gamayun.translatorswb.org/data/ https://iwslt.org/2021/low-resource | |
 | jdcinal | Japanese Dialogue Corpus of Information Navigation and Attentive Listening Annotated with Extended ISO-24617-2 Dialogue Act Tags | SLU | JPN | http://www.lrec-conf.org/proceedings/lrec2018/pdf/464.pdf http://tts.speech.cs.cmu.edu/awb/infomation_navigation_and_attentive_listening_0.2.zip | |
 | jkac | J-KAC: Japanese Kamishibai and audiobook corpus | TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/research-topics/j-kac_corpus | |
 | jmd | JMD: Japanese multi-dialect corpus for speech synthesis | TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/research-topics/jmd_corpus | |
 | jsss | JSSS: Japanese speech corpus for summarization and simplification | TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/research-topics/jsss_corpus | |
 | jsut | Japanese speech corpus of Saruwatari-lab., University of Tokyo | ASR/TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/publication/jsut | |
-| jtubespeech | Japanese YouTube Speech corpus | ASR/TTS | JPN | | |
+| jtubespeech | Japanese YouTube Speech corpus | ASR/TTS | JPN | | |
 | jv_openslr35 | Javanese | ASR | JAV | http://www.openslr.org/35 | |
 | jvs | JVS (Japanese versatile speech) corpus | TTS | JPN | https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus | |
 | ksponspeech | KsponSpeech (Korean spontaneous speech) corpus | ASR | KOR | https://aihub.or.kr/aidata/105 | |
@@ -49,26 +50,26 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
 | librimix | LibriMix: An Open-Source Dataset for Generalizable Speech Separation | SE | ENG | https://github.com/JorisCos/LibriMix | |
 | librispeech | LibriSpeech ASR corpus | ASR | ENG | http://www.openslr.org/12 | |
 | librispeech_100 | LibriSpeech ASR corpus 100h subset | ASR | ENG | http://www.openslr.org/12 | |
-| libritts | LibriTTS corpus | TTS | ENG | http://www.openslr.org/60 | |
+| libritts | LibriTTS corpus | TTS | ENG | http://www.openslr.org/60 | |
 | ljspeech | The LJ Speech Dataset | TTS | ENG | https://keithito.com/LJ-Speech-Dataset/ | |
 | lrs2 | The Oxford-BBC Lip Reading Sentences 2 (LRS2) Dataset | Lipreading/ASR | ENG | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | |
 | mini_an4 | Mini version of CMU AN4 database for the integration test | ASR/TTS/SE | ENG | http://www.speech.cs.cmu.edu/databases/an4/ | |
 | mini_librispeech | Mini version of Librispeech corpus | DIAR | ENG | https://openslr.org/31/ | |
-| mls | MLS (A large multilingual corpus derived from LibriVox audiobooks) | ASR | 8 languages | http://www.openslr.org/94/ | |
+| mls | MLS (A large multilingual corpus derived from LibriVox audiobooks) | ASR | 8 languages | http://www.openslr.org/94/ | |
 | nsc | National Speech Corpus | ASR | ENG-SG | https://www.imda.gov.sg/programme-listing/digital-services-lab/national-speech-corpus | |
-| open_li52 | Corpus combination with 52 languages(Commonvocie + voxforge) | Multilingual ASR | 52 languages | | |
+| open_li52 | Corpus combination with 52 languages (Commonvoice + voxforge) | Multilingual ASR | 52 languages | | |
 | polyphone_swiss_french | Swiss French Polyphone corpus | ASR | FRA | http://catalog.elra.info/en-us/repository/browse/ELRA-S0030_02 | |
 | primewords_chinese | Primewords Chinese Corpus Set 1 | ASR | CMN | https://www.openslr.org/47/ | |
-| puebla_nahuatl | Highland Puebla Nahuatl corpus (endangered language in central Mexico) | ASR | HPN | https://www.openslr.org/92/ | |
+| puebla_nahuatl | Highland Puebla Nahuatl corpus (endangered language in central Mexico) | ASR | HPN | https://www.openslr.org/92/ | |
 | reverb | REVERB (REverberant Voice Enhancement and Recognition Benchmark) challenge | ASR | ENG | https://reverb2014.dereverberation.com/ | |
 | ru_open_stt | Russian Open Speech To Text (STT/ASR) Dataset | ASR | RUS | https://github.com/snakers4/open_stt | |
 | ruslan | RUSLAN: Russian Spoken Language Corpus For Speech Synthesis | TTS | RUS | https://ruslan-corpus.github.io/ | |
 | snips | SNIPS: A dataset for spoken language understanding | SLU | ENG | https://github.com/sonos/spoken-language-understanding-research-datasets | |
 | seame | SEAME: a Mandarin-English Code-switching Speech Corpus in South-East Asia | ASR | ENG + CMN | https://catalog.ldc.upenn.edu/LDC2015S04 | |
-| siwis | SIWIS: Spoken Interaction with Interpretation in Switzerland | TTS | FRA | https://https://datashare.ed.ac.uk/handle/10283/2353 | |
+| siwis | SIWIS: Spoken Interaction with Interpretation in Switzerland | TTS | FRA | https://datashare.ed.ac.uk/handle/10283/2353 | |
 | slue-voxceleb | SLUE: Spoken Language Understanding Evaluation | SLU | ENG | https://github.com/asappresearch/slue-toolkit | |
 | slurp | SLURP: A Spoken Language Understanding Resource Package | SLU | ENG | https://github.com/pswietojanski/slurp | |
-| slurp_entity | SLURP: A Spoken Language Understanding Resource Package | SLU/Entity Classification | ENG | https://github.com/pswietojanski/slurp | |
+| slurp_entity | SLURP: A Spoken Language Understanding Resource Package | SLU/Entity Classifi. | ENG | https://github.com/pswietojanski/slurp | |
 | sms_wsj | SMS-WSJ: A database for in-depth analysis of multi-channel source separation algorithms | SE | ENG | https://github.com/fgnt/sms_wsj | |
 | speechcommands | Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition | SLU | ENG | https://www.tensorflow.org/datasets/catalog/speech_commands | |
 | spgispeech | SPGISpeech 5k corpus | ASR | ENG | https://datasets.kensho.com/datasets/scribe | |
@@ -79,12 +80,12 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
 | tedlium2 | TED-LIUM corpus release 2 | ASR | ENG | https://www.openslr.org/19/, http://www.lrec-conf.org/proceedings/lrec2014/pdf/1104_Paper.pdf | |
 | thchs30 | A Free Chinese Speech Corpus Released by CSLT@Tsinghua University | TTS | CMN | https://www.openslr.org/18/ | |
 | timit | TIMIT Acoustic-Phonetic Continuous Speech Corpus | ASR | ENG | https://catalog.ldc.upenn.edu/LDC93S1 | |
-| totonac | Highland Totonac corpus (endangered language in central Mexico) | ASR | TOS | http://www.openslr.org/107/ | |
-| tsukuyomi | つくよみちゃんコーパス | TTS | JPN | https://tyc.rei-yumesaki.net/material/corpus | |
+| totonac | Highland Totonac corpus (endangered language in central Mexico) | ASR | TOS | http://www.openslr.org/107/ | |
+| tsukuyomi | つくよみちゃんコーパス | TTS | JPN | https://tyc.rei-yumesaki.net/material/corpus | |
 | vctk | English Multi-speaker Corpus for CSTR Voice Cloning Toolkit | TTS | ENG | http://www.udialogue.org/download/cstr-vctk-corpus.html | |
 | vctk_noisyreverb | Noisy reverberant speech database (48kHz) | SE | ENG | https://datashare.ed.ac.uk/handle/10283/2826 | |
 | vivos | VIVOS (Vietnamese corpus for ASR) | ASR | VIE | https://ailab.hcmus.edu.vn/vivos/ | |
-| voxforge | VoxForge | ASR | 7 languages | http://www.voxforge.org/ | |
+| voxforge | VoxForge | ASR | 7 languages | http://www.voxforge.org/ | |
 | wenetspeech | WenetSpeech: A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition | ASR | CMN | https://wenet-e2e.github.io/WenetSpeech/ | |
 | wham | The WSJ0 Hipster Ambient Mixtures (WHAM!) dataset | SE | ENG | https://wham.whisper.ai/ | |
 | whamr | WHAMR!: Noisy and Reverberant Single-Channel Speech Separation | SE | ENG | https://wham.whisper.ai/ | |
@@ -94,3 +95,4 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
 | yesno | The "yesno" corpus | ASR | HEB | http://www.openslr.org/1 | |
 | yoloxochitl_mixtec | Yoloxochitl-Mixtec corpus (endangered language in central Mexico) | ASR | XTY | http://www.openslr.org/89 | |
 | zeroth_korean | Zeroth-Korean | ASR | KOR | http://www.openslr.org/40 | |
+
diff --git a/egs2/TEMPLATE/asr1/db.sh b/egs2/TEMPLATE/asr1/db.sh
index 3785aef57a8..88113b1d547 100755
--- a/egs2/TEMPLATE/asr1/db.sh
+++ b/egs2/TEMPLATE/asr1/db.sh
@@ -11,6 +11,7 @@ DIRHA_ENGLISH_PHDEV=
 DIRHA_WSJ=
 DIRHA_WSJ_PROCESSED="${PWD}/data/local/dirha_wsj_processed" # Output file path
 DNS=
+DSING=downloads
 WSJ0=
 WSJ1=
 WSJCAM0=
@@ -159,6 +160,7 @@ if [[ "$(hostname)" == tir* ]]; then
     IWSLT22_DIALECT=/projects/tir5/data/speech_corpora/LDC2022E01_IWSLT22_Tunisian_Arabic_Shared_Task_Training_Data/
     PRIMEWORDS_CHINESE=/projects/tir5/data/speech_corpora/Primewords_Chinese
     FISHER_CALLHOME_SPANISH=/projects/tir5/data/speech_corpora/fisher_callhome_spanish
+    DSING=/projects/tir5/data/speech_corpora/sing_300x30x2
 fi

 # For only JHU environment
diff --git a/egs2/TEMPLATE/ssl1/pyscripts/dump_km_label.py b/egs2/TEMPLATE/ssl1/pyscripts/dump_km_label.py
index 6b14ac4ec97..552c84f89ad 100644
--- a/egs2/TEMPLATE/ssl1/pyscripts/dump_km_label.py
+++ b/egs2/TEMPLATE/ssl1/pyscripts/dump_km_label.py
@@ -39,13 +39,13 @@ class ApplyKmeans(object):
     def __init__(self, km_path):
         self.km_model = joblib.load(km_path)
         self.nc = self.km_model.cluster_centers_.transpose()
-        self.nc_norm = (self.nc ** 2).sum(0, keepdims=True)
+        self.nc_norm = (self.nc**2).sum(0, keepdims=True)

     def __call__(self, x):
         if isinstance(x, torch.Tensor):
             x = x.cpu().numpy()
         probs = (
-            (x ** 2).sum(1, keepdims=True) - 2 * np.matmul(x, self.nc) + self.nc_norm
+            (x**2).sum(1, keepdims=True) - 2 * np.matmul(x, self.nc) + self.nc_norm
         )
         return np.argmin(probs, axis=1)
diff --git a/egs2/dsing/asr1/asr.sh b/egs2/dsing/asr1/asr.sh
new file mode 120000
index 00000000000..60b05122cfd
--- /dev/null
+++ b/egs2/dsing/asr1/asr.sh
@@ -0,0 +1 @@
+../../TEMPLATE/asr1/asr.sh
\ No newline at end of file
diff --git a/egs2/dsing/asr1/cmd.sh b/egs2/dsing/asr1/cmd.sh
new file mode 100644
index 00000000000..2aae6919fef
--- /dev/null
+++ b/egs2/dsing/asr1/cmd.sh
@@ -0,0 +1,110 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#  run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#    --time