From c12afc5afb0569b9d419d913cc60664665cf6ef3 Mon Sep 17 00:00:00 2001 From: Junbo Zhang Date: Sun, 22 Aug 2021 10:08:56 +0800 Subject: [PATCH 1/7] update GigaSpeech downloading --- examples/gigaspeech/gigaspeech.ipynb | 9 ++++----- lhotse/bin/modes/recipes/gigaspeech.py | 4 +++- lhotse/recipes/gigaspeech.py | 6 ++---- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/examples/gigaspeech/gigaspeech.ipynb b/examples/gigaspeech/gigaspeech.ipynb index 1d6e3731d..353335b30 100644 --- a/examples/gigaspeech/gigaspeech.ipynb +++ b/examples/gigaspeech/gigaspeech.ipynb @@ -94,12 +94,11 @@ "metadata": {}, "outputs": [], "source": [ + "password = ''", + "# You need to fill out the Google Form to get the password", + "# https://forms.gle/UuGQAPyscGRrUMLq6\n", "for part in dataset_parts:\n", - " # TODO: remove this try-except block in the stable version\n", - " try:\n", - " gigaspeech.download(part)\n", - " except NotImplementedError:\n", - " assert gigaspeech.json_path.is_file()" + " gigaspeech.download(password, part)" ] }, { diff --git a/lhotse/bin/modes/recipes/gigaspeech.py b/lhotse/bin/modes/recipes/gigaspeech.py index 25ffcf91e..297c771ab 100644 --- a/lhotse/bin/modes/recipes/gigaspeech.py +++ b/lhotse/bin/modes/recipes/gigaspeech.py @@ -29,10 +29,12 @@ def gigaspeech( @download.command(context_settings=dict(show_default=True)) +@click.argument('password', type=str) @click.argument('target_dir', type=click.Path()) @click.option('--subset', type=click.Choice(('auto',) + GIGASPEECH_PARTS), multiple=True, default=['auto'], help='Which parts of Gigaspeech to download (by default XL + DEV + TEST).') def gigaspeech( + password: str, target_dir: Pathlike, subset: List[str] ): @@ -41,4 +43,4 @@ def gigaspeech( logging.basicConfig(level=logging.INFO) if 'auto' in subset: subset = 'auto' - download_gigaspeech(target_dir, dataset_parts=subset) + download_gigaspeech(password, target_dir, dataset_parts=subset) diff --git a/lhotse/recipes/gigaspeech.py 
b/lhotse/recipes/gigaspeech.py index cb05fc312..134c3da6a 100644 --- a/lhotse/recipes/gigaspeech.py +++ b/lhotse/recipes/gigaspeech.py @@ -24,6 +24,7 @@ def download_gigaspeech( + password: str, target_dir: Pathlike = '.', dataset_parts: Optional[Union[str, Sequence[str]]] = "auto", ): @@ -41,10 +42,7 @@ def download_gigaspeech( for part in dataset_parts: logging.info(f'Downloading GigaSpeech part: {part}') - try: - gigaspeech.download('{' + part + '}') - except NotImplementedError: - raise ValueError(f"Could not download GigaSpeech part {part} -- speechcolab raised NotImplementedError.") + gigaspeech.download(password, '{' + part + '}') def prepare_gigaspeech( From 9ea89323e9b1b28edf133766bfccebb7f81db2d9 Mon Sep 17 00:00:00 2001 From: Junbo Zhang Date: Mon, 23 Aug 2021 08:51:16 +0800 Subject: [PATCH 2/7] update the gigaspeech recipe --- examples/gigaspeech/gigaspeech.ipynb | 17 +++++++---------- lhotse/bin/modes/recipes/gigaspeech.py | 10 ++++++---- lhotse/recipes/gigaspeech.py | 16 +++++++--------- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/examples/gigaspeech/gigaspeech.ipynb b/examples/gigaspeech/gigaspeech.ipynb index 353335b30..e54adbd1e 100644 --- a/examples/gigaspeech/gigaspeech.ipynb +++ b/examples/gigaspeech/gigaspeech.ipynb @@ -12,13 +12,12 @@ "from pathlib import Path\n", "\n", "import matplotlib.pyplot as plt\n", - "from speechcolab.datasets.gigaspeech import GigaSpeech\n", "\n", "from lhotse import CutSet, Fbank, LilcomFilesWriter\n", - "from lhotse.augmentation import SoxEffectTransform, RandomValue, pitch, reverb, speed\n", + "from lhotse.augmentation import SoxEffectTransform, RandomValue\n", "from lhotse.dataset import K2SpeechRecognitionDataset\n", "from lhotse.dataset.sampling import SingleCutSampler\n", - "from lhotse.recipes.gigaspeech import prepare_gigaspeech" + "from lhotse.recipes.gigaspeech import download_gigaspeech, prepare_gigaspeech" ] }, { @@ -84,7 +83,9 @@ "metadata": {}, "outputs": [], "source": [ - 
"gigaspeech = GigaSpeech(corpus_dir)" + "password = ''", + "# You need to fill out the Google Form to get the password", + "# https://forms.gle/UuGQAPyscGRrUMLq6\n" ] }, { @@ -94,11 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "password = ''", - "# You need to fill out the Google Form to get the password", - "# https://forms.gle/UuGQAPyscGRrUMLq6\n", - "for part in dataset_parts:\n", - " gigaspeech.download(password, part)" + "download_gigaspeech(password, corpus_dir, dataset_parts)" ] }, { @@ -128,7 +125,7 @@ }, "outputs": [], "source": [ - "gigaspeech_manifests = prepare_gigaspeech(gigaspeech, dataset_parts, output_dir, num_jobs=num_jobs)" + "gigaspeech_manifests = prepare_gigaspeech(corpus_dir, dataset_parts, output_dir, num_jobs=num_jobs)" ] }, { diff --git a/lhotse/bin/modes/recipes/gigaspeech.py b/lhotse/bin/modes/recipes/gigaspeech.py index 297c771ab..5fd99361a 100644 --- a/lhotse/bin/modes/recipes/gigaspeech.py +++ b/lhotse/bin/modes/recipes/gigaspeech.py @@ -12,7 +12,7 @@ @click.argument('corpus_dir', type=click.Path(exists=True, dir_okay=True)) @click.argument('output_dir', type=click.Path()) @click.option('--subset', type=click.Choice(('auto',) + GIGASPEECH_PARTS), multiple=True, - default=['auto'], help='Which parts of Gigaspeech to download (by default XL + DEV + TEST).') + default=['auto'], help='Which parts of Gigaspeech to download (by default {XL} + {DEV} + {TEST}).') @click.option('-j', '--num-jobs', type=int, default=1, help='How many threads to use (can give good speed-ups with slow disks).') def gigaspeech( @@ -32,15 +32,17 @@ def gigaspeech( @click.argument('password', type=str) @click.argument('target_dir', type=click.Path()) @click.option('--subset', type=click.Choice(('auto',) + GIGASPEECH_PARTS), multiple=True, - default=['auto'], help='Which parts of Gigaspeech to download (by default XL + DEV + TEST).') + default=['auto'], help='Which parts of Gigaspeech to download (by default {XL} + {DEV} + {TEST}).') +@click.option('--host', 
type=str, default='tsinghua', help='Which host to download Gigaspeech.') def gigaspeech( password: str, target_dir: Pathlike, - subset: List[str] + subset: List[str], + host: str ): """Gigaspeech download.""" # Convert (likely one-element) list with "auto" into a string. logging.basicConfig(level=logging.INFO) if 'auto' in subset: subset = 'auto' - download_gigaspeech(password, target_dir, dataset_parts=subset) + download_gigaspeech(password, target_dir, dataset_parts=subset, host=host) diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py index 134c3da6a..90f7f89cb 100644 --- a/lhotse/recipes/gigaspeech.py +++ b/lhotse/recipes/gigaspeech.py @@ -20,13 +20,12 @@ from lhotse.supervision import SupervisionSegment, SupervisionSet from lhotse.utils import Pathlike, Seconds, is_module_available -GIGASPEECH_PARTS = ('XL', 'L', 'M', 'S', 'XS', 'DEV', 'TEST') - def download_gigaspeech( password: str, target_dir: Pathlike = '.', dataset_parts: Optional[Union[str, Sequence[str]]] = "auto", + host: Optional[str] = 'tsinghua' ): if is_module_available('speechcolab'): from speechcolab.datasets.gigaspeech import GigaSpeech @@ -35,14 +34,13 @@ def download_gigaspeech( 'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab') gigaspeech = GigaSpeech(target_dir) - if dataset_parts == 'auto': - dataset_parts = ('XL', 'DEV', 'TEST') - elif isinstance(dataset_parts, str): - dataset_parts = [dataset_parts] + subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts + if isinstance(subsets, str): + subsets = [subsets] - for part in dataset_parts: + for part in subsets: logging.info(f'Downloading GigaSpeech part: {part}') - gigaspeech.download(password, '{' + part + '}') + gigaspeech.download(password, part, host=host) def prepare_gigaspeech( @@ -57,7 +55,7 @@ def prepare_gigaspeech( raise ImportError( 'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab') - 
subsets = ('XL', 'DEV', 'TEST') if dataset_parts == 'auto' else dataset_parts + subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts if isinstance(subsets, str): subsets = [subsets] corpus_dir = Path(corpus_dir) From b4912feea0fdf31bc57598ea7d81ba3354f6ce0c Mon Sep 17 00:00:00 2001 From: Junbo Zhang Date: Mon, 23 Aug 2021 09:00:49 +0800 Subject: [PATCH 3/7] update `prepare_gigaspeech` --- lhotse/recipes/gigaspeech.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py index 90f7f89cb..1ceee2342 100644 --- a/lhotse/recipes/gigaspeech.py +++ b/lhotse/recipes/gigaspeech.py @@ -89,7 +89,7 @@ def prepare_gigaspeech( recordings = [] supervisions = [] for recording, segments in tqdm( - ex.map(parse_utterance, gigaspeech.audios('{' + part + '}'), repeat(gigaspeech.root_path)), + ex.map(parse_utterance, gigaspeech.audios(part), repeat(gigaspeech.gigaspeech_dataset_dir)), desc='Processing GigaSpeech JSON entries' ): recordings.append(recording) From 939bab9b761ef820a826b43e95a0070f9739b3cd Mon Sep 17 00:00:00 2001 From: Junbo Zhang Date: Mon, 23 Aug 2021 16:33:31 +0800 Subject: [PATCH 4/7] Remove brackets from subset names --- examples/gigaspeech/gigaspeech.ipynb | 6 +++--- lhotse/bin/modes/recipes/gigaspeech.py | 4 ++-- lhotse/recipes/gigaspeech.py | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/gigaspeech/gigaspeech.ipynb b/examples/gigaspeech/gigaspeech.ipynb index e54adbd1e..2081ac7b3 100644 --- a/examples/gigaspeech/gigaspeech.ipynb +++ b/examples/gigaspeech/gigaspeech.ipynb @@ -55,7 +55,7 @@ "metadata": {}, "outputs": [], "source": [ - "train_set = '{XS}'" + "train_set = 'XS'" ] }, { @@ -65,7 +65,7 @@ "metadata": {}, "outputs": [], "source": [ - "dataset_parts = (train_set, '{TEST}')" + "dataset_parts = (train_set, 'TEST')" ] }, { @@ -204,7 +204,7 @@ "metadata": {}, "outputs": [], "source": [ - "dataset_test = 
K2SpeechRecognitionDataset(gigaspeech_manifests['{TEST}'])\n", + "dataset_test = K2SpeechRecognitionDataset(gigaspeech_manifests['TEST'])\n", "dataset_train = K2SpeechRecognitionDataset(gigaspeech_manifests[train_set])" ] }, diff --git a/lhotse/bin/modes/recipes/gigaspeech.py b/lhotse/bin/modes/recipes/gigaspeech.py index 5fd99361a..e0e733283 100644 --- a/lhotse/bin/modes/recipes/gigaspeech.py +++ b/lhotse/bin/modes/recipes/gigaspeech.py @@ -12,7 +12,7 @@ @click.argument('corpus_dir', type=click.Path(exists=True, dir_okay=True)) @click.argument('output_dir', type=click.Path()) @click.option('--subset', type=click.Choice(('auto',) + GIGASPEECH_PARTS), multiple=True, - default=['auto'], help='Which parts of Gigaspeech to download (by default {XL} + {DEV} + {TEST}).') + default=['auto'], help='Which parts of Gigaspeech to download (by default XL + DEV + TEST).') @click.option('-j', '--num-jobs', type=int, default=1, help='How many threads to use (can give good speed-ups with slow disks).') def gigaspeech( @@ -32,7 +32,7 @@ def gigaspeech( @click.argument('password', type=str) @click.argument('target_dir', type=click.Path()) @click.option('--subset', type=click.Choice(('auto',) + GIGASPEECH_PARTS), multiple=True, - default=['auto'], help='Which parts of Gigaspeech to download (by default {XL} + {DEV} + {TEST}).') + default=['auto'], help='Which parts of Gigaspeech to download (by default XL + DEV + TEST).') @click.option('--host', type=str, default='tsinghua', help='Which host to download Gigaspeech.') def gigaspeech( password: str, diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py index 1ceee2342..d3dc281c4 100644 --- a/lhotse/recipes/gigaspeech.py +++ b/lhotse/recipes/gigaspeech.py @@ -34,13 +34,13 @@ def download_gigaspeech( 'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab') gigaspeech = GigaSpeech(target_dir) - subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts + 
subsets = ('XL', 'DEV', 'TEST') if dataset_parts == 'auto' else dataset_parts if isinstance(subsets, str): subsets = [subsets] for part in subsets: logging.info(f'Downloading GigaSpeech part: {part}') - gigaspeech.download(password, part, host=host) + gigaspeech.download(password, '{' + part + '}', host=host) def prepare_gigaspeech( @@ -55,7 +55,7 @@ def prepare_gigaspeech( raise ImportError( 'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab') - subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts + subsets = ('XL', 'DEV', 'TEST') if dataset_parts == 'auto' else dataset_parts if isinstance(subsets, str): subsets = [subsets] corpus_dir = Path(corpus_dir) @@ -89,7 +89,7 @@ def prepare_gigaspeech( recordings = [] supervisions = [] for recording, segments in tqdm( - ex.map(parse_utterance, gigaspeech.audios(part), repeat(gigaspeech.gigaspeech_dataset_dir)), + ex.map(parse_utterance, gigaspeech.audios('{' + part + '}'), repeat(gigaspeech.gigaspeech_dataset_dir)), desc='Processing GigaSpeech JSON entries' ): recordings.append(recording) From 781ff9283acf892f070bae5b0fdf32eb7184b784 Mon Sep 17 00:00:00 2001 From: Junbo Zhang Date: Mon, 23 Aug 2021 21:16:49 +0800 Subject: [PATCH 5/7] Fix the invalid variable `GIGASPEECH_PARTS` in CI --- lhotse/recipes/gigaspeech.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py index d3dc281c4..229dbde7e 100644 --- a/lhotse/recipes/gigaspeech.py +++ b/lhotse/recipes/gigaspeech.py @@ -21,6 +21,9 @@ from lhotse.utils import Pathlike, Seconds, is_module_available +GIGASPEECH_PARTS = ('XS', 'S', 'M', 'L', 'XL', 'DEV', 'TEST') + + def download_gigaspeech( password: str, target_dir: Pathlike = '.', From 06e0ca753cbc1a9a238467159f486ac2e82167b1 Mon Sep 17 00:00:00 2001 From: Junbo Zhang Date: Mon, 23 Aug 2021 21:24:04 +0800 Subject: [PATCH 6/7] Update gigaspeech.py --- lhotse/recipes/gigaspeech.py | 3 +-- 1 
file changed, 1 insertion(+), 2 deletions(-) diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py index 229dbde7e..ff78b5cec 100644 --- a/lhotse/recipes/gigaspeech.py +++ b/lhotse/recipes/gigaspeech.py @@ -20,8 +20,7 @@ from lhotse.supervision import SupervisionSegment, SupervisionSet from lhotse.utils import Pathlike, Seconds, is_module_available - -GIGASPEECH_PARTS = ('XS', 'S', 'M', 'L', 'XL', 'DEV', 'TEST') +GIGASPEECH_PARTS = ('XL', 'L', 'M', 'S', 'XS', 'DEV', 'TEST') def download_gigaspeech( From e84556b58466ad80fd0bfb68ced7e6ecad31cc03 Mon Sep 17 00:00:00 2001 From: Junbo Zhang Date: Mon, 23 Aug 2021 21:31:53 +0800 Subject: [PATCH 7/7] for making fewer modifications --- lhotse/recipes/gigaspeech.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lhotse/recipes/gigaspeech.py b/lhotse/recipes/gigaspeech.py index ff78b5cec..698775090 100644 --- a/lhotse/recipes/gigaspeech.py +++ b/lhotse/recipes/gigaspeech.py @@ -36,11 +36,12 @@ def download_gigaspeech( 'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab') gigaspeech = GigaSpeech(target_dir) - subsets = ('XL', 'DEV', 'TEST') if dataset_parts == 'auto' else dataset_parts - if isinstance(subsets, str): - subsets = [subsets] + if dataset_parts == 'auto': + dataset_parts = ('XL', 'DEV', 'TEST') + elif isinstance(dataset_parts, str): + dataset_parts = [dataset_parts] - for part in subsets: + for part in dataset_parts: logging.info(f'Downloading GigaSpeech part: {part}') - gigaspeech.download(password, '{' + part + '}', host=host)