From c12afc5afb0569b9d419d913cc60664665cf6ef3 Mon Sep 17 00:00:00 2001
From: Junbo Zhang <zhangjunbo1@xiaomi.com>
Date: Sun, 22 Aug 2021 10:08:56 +0800
Subject: [PATCH 1/7] update GigaSpeech downloading

---
 examples/gigaspeech/gigaspeech.ipynb   | 9 ++++-----
 lhotse/bin/modes/recipes/gigaspeech.py | 4 +++-
 lhotse/recipes/gigaspeech.py           | 6 ++----
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/examples/gigaspeech/gigaspeech.ipynb b/examples/gigaspeech/gigaspeech.ipynb
index 1d6e3731d..353335b30 100644
--- a/examples/gigaspeech/gigaspeech.ipynb
+++ b/examples/gigaspeech/gigaspeech.ipynb
@@ -94,12 +94,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "password = ''",
+    "# You need to fill out the Google Form to get the password",
+    "# https://forms.gle/UuGQAPyscGRrUMLq6\n",
     "for part in dataset_parts:\n",
-    "    # TODO: remove this try-except block in the stable version\n",
-    "    try:\n",
-    "        gigaspeech.download(part)\n",
-    "    except NotImplementedError:\n",
-    "        assert gigaspeech.json_path.is_file()"
+    "    gigaspeech.download(password, part)"
    ]
   },
   {
diff --git a/lhotse/bin/modes/recipes/gigaspeech.py b/lhotse/bin/modes/recipes/gigaspeech.py
index 25ffcf91e..297c771ab 100644
--- a/lhotse/bin/modes/recipes/gigaspeech.py
+++ b/lhotse/bin/modes/recipes/gigaspeech.py
@@ -29,10 +29,12 @@ def gigaspeech(
 
 
 @download.command(context_settings=dict(show_default=True))
+@click.argument('password', type=str)
 @click.argument('target_dir', type=click.Path())
 @click.option('--subset', type=click.Choice(('auto',) + GIGASPEECH_PARTS), multiple=True,
               default=['auto'], help='Which parts of Gigaspeech to download (by default XL + DEV + TEST).')
 def gigaspeech(
+        password: str,
         target_dir: Pathlike,
         subset: List[str]
 ):
@@ -41,4 +43,4 @@ def gigaspeech(
     logging.basicConfig(level=logging.INFO)
     if 'auto' in subset:
         subset = 'auto'
-    download_gigaspeech(target_dir, dataset_parts=subset)
+    download_gigaspeech(password, target_dir, dataset_parts=subset)
diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py
index cb05fc312..134c3da6a 100644
--- a/lhotse/recipes/gigaspeech.py
+++ b/lhotse/recipes/gigaspeech.py
@@ -24,6 +24,7 @@
 
 
 def download_gigaspeech(
+        password: str,
         target_dir: Pathlike = '.',
         dataset_parts: Optional[Union[str, Sequence[str]]] = "auto",
 ):
@@ -41,10 +42,7 @@ def download_gigaspeech(
 
     for part in dataset_parts:
         logging.info(f'Downloading GigaSpeech part: {part}')
-        try:
-            gigaspeech.download('{' + part + '}')
-        except NotImplementedError:
-            raise ValueError(f"Could not download GigaSpeech part {part} -- speechcolab raised NotImplementedError.")
+        gigaspeech.download(password, '{' + part + '}')
 
 
 def prepare_gigaspeech(

From 9ea89323e9b1b28edf133766bfccebb7f81db2d9 Mon Sep 17 00:00:00 2001
From: Junbo Zhang <zhangjunbo1@xiaomi.com>
Date: Mon, 23 Aug 2021 08:51:16 +0800
Subject: [PATCH 2/7] update the gigaspeech recipe

---
 examples/gigaspeech/gigaspeech.ipynb   | 17 +++++++----------
 lhotse/bin/modes/recipes/gigaspeech.py | 10 ++++++----
 lhotse/recipes/gigaspeech.py           | 16 +++++++---------
 3 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/examples/gigaspeech/gigaspeech.ipynb b/examples/gigaspeech/gigaspeech.ipynb
index 353335b30..e54adbd1e 100644
--- a/examples/gigaspeech/gigaspeech.ipynb
+++ b/examples/gigaspeech/gigaspeech.ipynb
@@ -12,13 +12,12 @@
     "from pathlib import Path\n",
     "\n",
     "import matplotlib.pyplot as plt\n",
-    "from speechcolab.datasets.gigaspeech import GigaSpeech\n",
     "\n",
     "from lhotse import CutSet, Fbank, LilcomFilesWriter\n",
-    "from lhotse.augmentation import SoxEffectTransform, RandomValue, pitch, reverb, speed\n",
+    "from lhotse.augmentation import SoxEffectTransform, RandomValue\n",
     "from lhotse.dataset import K2SpeechRecognitionDataset\n",
     "from lhotse.dataset.sampling import SingleCutSampler\n",
-    "from lhotse.recipes.gigaspeech import prepare_gigaspeech"
+    "from lhotse.recipes.gigaspeech import download_gigaspeech, prepare_gigaspeech"
    ]
   },
   {
@@ -84,7 +83,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "gigaspeech = GigaSpeech(corpus_dir)"
+    "password = ''",
+    "# You need to fill out the Google Form to get the password",
+    "# https://forms.gle/UuGQAPyscGRrUMLq6\n"
    ]
   },
   {
@@ -94,11 +95,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "password = ''",
-    "# You need to fill out the Google Form to get the password",
-    "# https://forms.gle/UuGQAPyscGRrUMLq6\n",
-    "for part in dataset_parts:\n",
-    "    gigaspeech.download(password, part)"
+    "download_gigaspeech(password, corpus_dir, dataset_parts)"
    ]
   },
   {
@@ -128,7 +125,7 @@
    },
    "outputs": [],
    "source": [
-    "gigaspeech_manifests = prepare_gigaspeech(gigaspeech, dataset_parts, output_dir, num_jobs=num_jobs)"
+    "gigaspeech_manifests = prepare_gigaspeech(corpus_dir, dataset_parts, output_dir, num_jobs=num_jobs)"
    ]
   },
   {
diff --git a/lhotse/bin/modes/recipes/gigaspeech.py b/lhotse/bin/modes/recipes/gigaspeech.py
index 297c771ab..5fd99361a 100644
--- a/lhotse/bin/modes/recipes/gigaspeech.py
+++ b/lhotse/bin/modes/recipes/gigaspeech.py
@@ -12,7 +12,7 @@
 @click.argument('corpus_dir', type=click.Path(exists=True, dir_okay=True))
 @click.argument('output_dir', type=click.Path())
 @click.option('--subset', type=click.Choice(('auto',) + GIGASPEECH_PARTS), multiple=True,
-              default=['auto'], help='Which parts of Gigaspeech to download (by default XL + DEV + TEST).')
+              default=['auto'], help='Which parts of Gigaspeech to download (by default {XL} + {DEV} + {TEST}).')
 @click.option('-j', '--num-jobs', type=int, default=1,
               help='How many threads to use (can give good speed-ups with slow disks).')
 def gigaspeech(
@@ -32,15 +32,17 @@ def gigaspeech(
 @click.argument('password', type=str)
 @click.argument('target_dir', type=click.Path())
 @click.option('--subset', type=click.Choice(('auto',) + GIGASPEECH_PARTS), multiple=True,
-              default=['auto'], help='Which parts of Gigaspeech to download (by default XL + DEV + TEST).')
+              default=['auto'], help='Which parts of Gigaspeech to download (by default {XL} + {DEV} + {TEST}).')
+@click.option('--host', type=str, default='tsinghua', help='Which host to download Gigaspeech.')
 def gigaspeech(
         password: str,
         target_dir: Pathlike,
-        subset: List[str]
+        subset: List[str],
+        host: str
 ):
     """Gigaspeech download."""
     # Convert (likely one-element) list with "auto" into a string.
     logging.basicConfig(level=logging.INFO)
     if 'auto' in subset:
         subset = 'auto'
-    download_gigaspeech(password, target_dir, dataset_parts=subset)
+    download_gigaspeech(password, target_dir, dataset_parts=subset, host=host)
diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py
index 134c3da6a..90f7f89cb 100644
--- a/lhotse/recipes/gigaspeech.py
+++ b/lhotse/recipes/gigaspeech.py
@@ -20,13 +20,12 @@
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, Seconds, is_module_available
 
-GIGASPEECH_PARTS = ('XL', 'L', 'M', 'S', 'XS', 'DEV', 'TEST')
-
 
 def download_gigaspeech(
         password: str,
         target_dir: Pathlike = '.',
         dataset_parts: Optional[Union[str, Sequence[str]]] = "auto",
+        host: Optional[str] = 'tsinghua'
 ):
     if is_module_available('speechcolab'):
         from speechcolab.datasets.gigaspeech import GigaSpeech
@@ -35,14 +34,13 @@ def download_gigaspeech(
             'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')
     gigaspeech = GigaSpeech(target_dir)
 
-    if dataset_parts == 'auto':
-        dataset_parts = ('XL', 'DEV', 'TEST')
-    elif isinstance(dataset_parts, str):
-        dataset_parts = [dataset_parts]
+    subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts
+    if isinstance(subsets, str):
+        subsets = [subsets]
 
-    for part in dataset_parts:
+    for part in subsets:
         logging.info(f'Downloading GigaSpeech part: {part}')
-        gigaspeech.download(password, '{' + part + '}')
+        gigaspeech.download(password, part, host=host)
 
 
 def prepare_gigaspeech(
@@ -57,7 +55,7 @@ def prepare_gigaspeech(
         raise ImportError(
             'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')
 
-    subsets = ('XL', 'DEV', 'TEST') if dataset_parts == 'auto' else dataset_parts
+    subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts
     if isinstance(subsets, str):
         subsets = [subsets]
     corpus_dir = Path(corpus_dir)

From b4912feea0fdf31bc57598ea7d81ba3354f6ce0c Mon Sep 17 00:00:00 2001
From: Junbo Zhang <zhangjunbo1@xiaomi.com>
Date: Mon, 23 Aug 2021 09:00:49 +0800
Subject: [PATCH 3/7] update `prepare_gigaspeech`

---
 lhotse/recipes/gigaspeech.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py
index 90f7f89cb..1ceee2342 100644
--- a/lhotse/recipes/gigaspeech.py
+++ b/lhotse/recipes/gigaspeech.py
@@ -89,7 +89,7 @@ def prepare_gigaspeech(
             recordings = []
             supervisions = []
             for recording, segments in tqdm(
-                    ex.map(parse_utterance, gigaspeech.audios('{' + part + '}'), repeat(gigaspeech.root_path)),
+                    ex.map(parse_utterance, gigaspeech.audios(part), repeat(gigaspeech.gigaspeech_dataset_dir)),
                     desc='Processing GigaSpeech JSON entries'
             ):
                 recordings.append(recording)

From 939bab9b761ef820a826b43e95a0070f9739b3cd Mon Sep 17 00:00:00 2001
From: Junbo Zhang <zhangjunbo1@xiaomi.com>
Date: Mon, 23 Aug 2021 16:33:31 +0800
Subject: [PATCH 4/7] Remove brackets from subset names

---
 examples/gigaspeech/gigaspeech.ipynb   | 6 +++---
 lhotse/bin/modes/recipes/gigaspeech.py | 4 ++--
 lhotse/recipes/gigaspeech.py           | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/gigaspeech/gigaspeech.ipynb b/examples/gigaspeech/gigaspeech.ipynb
index e54adbd1e..2081ac7b3 100644
--- a/examples/gigaspeech/gigaspeech.ipynb
+++ b/examples/gigaspeech/gigaspeech.ipynb
@@ -55,7 +55,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "train_set = '{XS}'"
+    "train_set = 'XS'"
    ]
   },
   {
@@ -65,7 +65,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset_parts = (train_set, '{TEST}')"
+    "dataset_parts = (train_set, 'TEST')"
    ]
   },
   {
@@ -204,7 +204,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset_test = K2SpeechRecognitionDataset(gigaspeech_manifests['{TEST}'])\n",
+    "dataset_test = K2SpeechRecognitionDataset(gigaspeech_manifests['TEST'])\n",
     "dataset_train = K2SpeechRecognitionDataset(gigaspeech_manifests[train_set])"
    ]
   },
diff --git a/lhotse/bin/modes/recipes/gigaspeech.py b/lhotse/bin/modes/recipes/gigaspeech.py
index 5fd99361a..e0e733283 100644
--- a/lhotse/bin/modes/recipes/gigaspeech.py
+++ b/lhotse/bin/modes/recipes/gigaspeech.py
@@ -12,7 +12,7 @@
 @click.argument('corpus_dir', type=click.Path(exists=True, dir_okay=True))
 @click.argument('output_dir', type=click.Path())
 @click.option('--subset', type=click.Choice(('auto',) + GIGASPEECH_PARTS), multiple=True,
-              default=['auto'], help='Which parts of Gigaspeech to download (by default {XL} + {DEV} + {TEST}).')
+              default=['auto'], help='Which parts of Gigaspeech to download (by default XL + DEV + TEST).')
 @click.option('-j', '--num-jobs', type=int, default=1,
               help='How many threads to use (can give good speed-ups with slow disks).')
 def gigaspeech(
@@ -32,7 +32,7 @@ def gigaspeech(
 @click.argument('password', type=str)
 @click.argument('target_dir', type=click.Path())
 @click.option('--subset', type=click.Choice(('auto',) + GIGASPEECH_PARTS), multiple=True,
-              default=['auto'], help='Which parts of Gigaspeech to download (by default {XL} + {DEV} + {TEST}).')
+              default=['auto'], help='Which parts of Gigaspeech to download (by default XL + DEV + TEST).')
 @click.option('--host', type=str, default='tsinghua', help='Which host to download Gigaspeech.')
 def gigaspeech(
         password: str,
diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py
index 1ceee2342..d3dc281c4 100644
--- a/lhotse/recipes/gigaspeech.py
+++ b/lhotse/recipes/gigaspeech.py
@@ -34,13 +34,13 @@ def download_gigaspeech(
             'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')
     gigaspeech = GigaSpeech(target_dir)
 
-    subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts
+    subsets = ('XL', 'DEV', 'TEST') if dataset_parts == 'auto' else dataset_parts
     if isinstance(subsets, str):
         subsets = [subsets]
 
     for part in subsets:
         logging.info(f'Downloading GigaSpeech part: {part}')
-        gigaspeech.download(password, part, host=host)
+        gigaspeech.download(password, '{' + part + '}', host=host)
 
 
 def prepare_gigaspeech(
@@ -55,7 +55,7 @@ def prepare_gigaspeech(
         raise ImportError(
             'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')
 
-    subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts
+    subsets = ('XL', 'DEV', 'TEST') if dataset_parts == 'auto' else dataset_parts
     if isinstance(subsets, str):
         subsets = [subsets]
     corpus_dir = Path(corpus_dir)
@@ -89,7 +89,7 @@ def prepare_gigaspeech(
             recordings = []
             supervisions = []
             for recording, segments in tqdm(
-                    ex.map(parse_utterance, gigaspeech.audios(part), repeat(gigaspeech.gigaspeech_dataset_dir)),
+                    ex.map(parse_utterance, gigaspeech.audios('{' + part + '}'), repeat(gigaspeech.gigaspeech_dataset_dir)),
                     desc='Processing GigaSpeech JSON entries'
             ):
                 recordings.append(recording)

From 781ff9283acf892f070bae5b0fdf32eb7184b784 Mon Sep 17 00:00:00 2001
From: Junbo Zhang <zhangjunbo1@xiaomi.com>
Date: Mon, 23 Aug 2021 21:16:49 +0800
Subject: [PATCH 5/7] Fix the invalid variable `GIGASPEECH_PARTS` in CI

---
 lhotse/recipes/gigaspeech.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py
index d3dc281c4..229dbde7e 100644
--- a/lhotse/recipes/gigaspeech.py
+++ b/lhotse/recipes/gigaspeech.py
@@ -21,6 +21,9 @@
 from lhotse.utils import Pathlike, Seconds, is_module_available
 
 
+GIGASPEECH_PARTS = ('XS', 'S', 'M', 'L', 'XL', 'DEV', 'TEST')
+
+
 def download_gigaspeech(
         password: str,
         target_dir: Pathlike = '.',

From 06e0ca753cbc1a9a238467159f486ac2e82167b1 Mon Sep 17 00:00:00 2001
From: Junbo Zhang <zhangjunbo1@xiaomi.com>
Date: Mon, 23 Aug 2021 21:24:04 +0800
Subject: [PATCH 6/7] Restore the original order of GIGASPEECH_PARTS

---
 lhotse/recipes/gigaspeech.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py
index 229dbde7e..ff78b5cec 100644
--- a/lhotse/recipes/gigaspeech.py
+++ b/lhotse/recipes/gigaspeech.py
@@ -20,8 +20,7 @@
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.utils import Pathlike, Seconds, is_module_available
 
-
-GIGASPEECH_PARTS = ('XS', 'S', 'M', 'L', 'XL', 'DEV', 'TEST')
+GIGASPEECH_PARTS = ('XL', 'L', 'M', 'S', 'XS', 'DEV', 'TEST')
 
 
 def download_gigaspeech(

From e84556b58466ad80fd0bfb68ced7e6ecad31cc03 Mon Sep 17 00:00:00 2001
From: Junbo Zhang <zhangjunbo1@xiaomi.com>
Date: Mon, 23 Aug 2021 21:31:53 +0800
Subject: [PATCH 7/7] for making fewer modifications

---
 lhotse/recipes/gigaspeech.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py
index ff78b5cec..698775090 100644
--- a/lhotse/recipes/gigaspeech.py
+++ b/lhotse/recipes/gigaspeech.py
@@ -36,11 +36,12 @@ def download_gigaspeech(
             'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')
     gigaspeech = GigaSpeech(target_dir)
 
-    subsets = ('XL', 'DEV', 'TEST') if dataset_parts == 'auto' else dataset_parts
-    if isinstance(subsets, str):
-        subsets = [subsets]
+    if dataset_parts == 'auto':
+        dataset_parts = ('XL', 'DEV', 'TEST')
+    elif isinstance(dataset_parts, str):
+        dataset_parts = [dataset_parts]
 
-    for part in subsets:
+    for part in dataset_parts:
         logging.info(f'Downloading GigaSpeech part: {part}')
         gigaspeech.download(password, '{' + part + '}', host=host)