lhotse-speech · pzelasko · Aug 23, 2021 · Aug 22, 2021 · Aug 23, 2021 · Aug 23, 2021
diff --git a/examples/gigaspeech/gigaspeech.ipynb b/examples/gigaspeech/gigaspeech.ipynb
@@ -12,13 +12,12 @@
     "from pathlib import Path\n",
     "\n",
     "import matplotlib.pyplot as plt\n",
-    "from speechcolab.datasets.gigaspeech import GigaSpeech\n",
     "\n",
     "from lhotse import CutSet, Fbank, LilcomFilesWriter\n",
-    "from lhotse.augmentation import SoxEffectTransform, RandomValue, pitch, reverb, speed\n",
+    "from lhotse.augmentation import SoxEffectTransform, RandomValue\n",
     "from lhotse.dataset import K2SpeechRecognitionDataset\n",
     "from lhotse.dataset.sampling import SingleCutSampler\n",
-    "from lhotse.recipes.gigaspeech import prepare_gigaspeech"
+    "from lhotse.recipes.gigaspeech import download_gigaspeech, prepare_gigaspeech"
    ]
   },
   {
@@ -56,7 +55,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "train_set = '{XS}'"
+    "train_set = 'XS'"
    ]
   },
   {
@@ -66,7 +65,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset_parts = (train_set, '{TEST}')"
+    "dataset_parts = (train_set, 'TEST')"
    ]
   },
   {
@@ -84,7 +83,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "gigaspeech = GigaSpeech(corpus_dir)"
+    "password = ''\n",
+    "# You need to fill out the Google Form to get the password\n",
+    "# https://forms.gle/UuGQAPyscGRrUMLq6\n"
    ]
   },
   {
@@ -94,12 +95,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "for part in dataset_parts:\n",
-    "    # TODO: remove this try-except block in the stable version\n",
-    "    try:\n",
-    "        gigaspeech.download(part)\n",
-    "    except NotImplementedError:\n",
-    "        assert gigaspeech.json_path.is_file()"
+    "download_gigaspeech(password, corpus_dir, dataset_parts)"
    ]
   },
   {
@@ -129,7 +125,7 @@
    },
    "outputs": [],
    "source": [
-    "gigaspeech_manifests = prepare_gigaspeech(gigaspeech, dataset_parts, output_dir, num_jobs=num_jobs)"
+    "gigaspeech_manifests = prepare_gigaspeech(corpus_dir, dataset_parts, output_dir, num_jobs=num_jobs)"
    ]
   },
   {
@@ -208,7 +204,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset_test = K2SpeechRecognitionDataset(gigaspeech_manifests['{TEST}'])\n",
+    "dataset_test = K2SpeechRecognitionDataset(gigaspeech_manifests['TEST'])\n",
     "dataset_train = K2SpeechRecognitionDataset(gigaspeech_manifests[train_set])"
    ]
   },

diff --git a/lhotse/bin/modes/recipes/gigaspeech.py b/lhotse/bin/modes/recipes/gigaspeech.py
@@ -29,16 +29,20 @@ def gigaspeech(
 
 
 @download.command(context_settings=dict(show_default=True))
+@click.argument('password', type=str)
 @click.argument('target_dir', type=click.Path())
 @click.option('--subset', type=click.Choice(('auto',) + GIGASPEECH_PARTS), multiple=True,
               default=['auto'], help='Which parts of Gigaspeech to download (by default XL + DEV + TEST).')
+@click.option('--host', type=str, default='tsinghua', help='Which host to download Gigaspeech.')
 def gigaspeech(
+        password: str,
         target_dir: Pathlike,
-        subset: List[str]
+        subset: List[str],
+        host: str
 ):
     """Gigaspeech download."""
     # Convert (likely one-element) list with "auto" into a string.
     logging.basicConfig(level=logging.INFO)
     if 'auto' in subset:
         subset = 'auto'
-    download_gigaspeech(target_dir, dataset_parts=subset)
+    download_gigaspeech(password, target_dir, dataset_parts=subset, host=host)
diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py
@@ -24,8 +24,10 @@
 
 
 def download_gigaspeech(
+        password: str,
         target_dir: Pathlike = '.',
         dataset_parts: Optional[Union[str, Sequence[str]]] = "auto",
+        host: Optional[str] = 'tsinghua'
 ):
     if is_module_available('speechcolab'):
         from speechcolab.datasets.gigaspeech import GigaSpeech
@@ -41,10 +43,7 @@ def download_gigaspeech(
 
     for part in dataset_parts:
         logging.info(f'Downloading GigaSpeech part: {part}')
-        try:
-            gigaspeech.download('{' + part + '}')
-        except NotImplementedError:
-            raise ValueError(f"Could not download GigaSpeech part {part} -- speechcolab raised NotImplementedError.")
+        gigaspeech.download(password, '{' + part + '}', host=host)
 
 
 def prepare_gigaspeech(
@@ -93,7 +92,7 @@ def prepare_gigaspeech(
             recordings = []
             supervisions = []
             for recording, segments in tqdm(
-                    ex.map(parse_utterance, gigaspeech.audios('{' + part + '}'), repeat(gigaspeech.root_path)),
+                    ex.map(parse_utterance, gigaspeech.audios('{' + part + '}'), repeat(gigaspeech.gigaspeech_dataset_dir)),
                     desc='Processing GigaSpeech JSON entries'
             ):
                 recordings.append(recording)