Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Downloading GigaSpeech with PySpeechColab #381

Merged
merged 8 commits into from
Aug 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 10 additions & 14 deletions examples/gigaspeech/gigaspeech.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,12 @@
"from pathlib import Path\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from speechcolab.datasets.gigaspeech import GigaSpeech\n",
"\n",
"from lhotse import CutSet, Fbank, LilcomFilesWriter\n",
"from lhotse.augmentation import SoxEffectTransform, RandomValue, pitch, reverb, speed\n",
"from lhotse.augmentation import SoxEffectTransform, RandomValue\n",
"from lhotse.dataset import K2SpeechRecognitionDataset\n",
"from lhotse.dataset.sampling import SingleCutSampler\n",
"from lhotse.recipes.gigaspeech import prepare_gigaspeech"
"from lhotse.recipes.gigaspeech import download_gigaspeech, prepare_gigaspeech"
]
},
{
Expand Down Expand Up @@ -56,7 +55,7 @@
"metadata": {},
"outputs": [],
"source": [
"train_set = '{XS}'"
"train_set = 'XS'"
]
},
{
Expand All @@ -66,7 +65,7 @@
"metadata": {},
"outputs": [],
"source": [
"dataset_parts = (train_set, '{TEST}')"
"dataset_parts = (train_set, 'TEST')"
]
},
{
Expand All @@ -84,7 +83,9 @@
"metadata": {},
"outputs": [],
"source": [
"gigaspeech = GigaSpeech(corpus_dir)"
"password = ''",
"# You need to fill out the Google Form to get the password",
"# https://forms.gle/UuGQAPyscGRrUMLq6\n"
]
},
{
Expand All @@ -94,12 +95,7 @@
"metadata": {},
"outputs": [],
"source": [
"for part in dataset_parts:\n",
" # TODO: remove this try-except block in the stable version\n",
" try:\n",
" gigaspeech.download(part)\n",
" except NotImplementedError:\n",
" assert gigaspeech.json_path.is_file()"
"download_gigaspeech(password, corpus_dir, dataset_parts)"
]
},
{
Expand Down Expand Up @@ -129,7 +125,7 @@
},
"outputs": [],
"source": [
"gigaspeech_manifests = prepare_gigaspeech(gigaspeech, dataset_parts, output_dir, num_jobs=num_jobs)"
"gigaspeech_manifests = prepare_gigaspeech(corpus_dir, dataset_parts, output_dir, num_jobs=num_jobs)"
]
},
{
Expand Down Expand Up @@ -208,7 +204,7 @@
"metadata": {},
"outputs": [],
"source": [
"dataset_test = K2SpeechRecognitionDataset(gigaspeech_manifests['{TEST}'])\n",
"dataset_test = K2SpeechRecognitionDataset(gigaspeech_manifests['TEST'])\n",
"dataset_train = K2SpeechRecognitionDataset(gigaspeech_manifests[train_set])"
]
},
Expand Down
8 changes: 6 additions & 2 deletions lhotse/bin/modes/recipes/gigaspeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,20 @@ def gigaspeech(


@download.command(context_settings=dict(show_default=True))
@click.argument('password', type=str)
@click.argument('target_dir', type=click.Path())
@click.option('--subset', type=click.Choice(('auto',) + GIGASPEECH_PARTS), multiple=True,
default=['auto'], help='Which parts of Gigaspeech to download (by default XL + DEV + TEST).')
@click.option('--host', type=str, default='tsinghua', help='Which host to download Gigaspeech.')
def gigaspeech(
password: str,
target_dir: Pathlike,
subset: List[str]
subset: List[str],
host: str
):
"""Gigaspeech download."""
# Convert (likely one-element) list with "auto" into a string.
logging.basicConfig(level=logging.INFO)
if 'auto' in subset:
subset = 'auto'
download_gigaspeech(target_dir, dataset_parts=subset)
download_gigaspeech(password, target_dir, dataset_parts=subset, host=host)
9 changes: 4 additions & 5 deletions lhotse/recipes/gigaspeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@


def download_gigaspeech(
password: str,
target_dir: Pathlike = '.',
dataset_parts: Optional[Union[str, Sequence[str]]] = "auto",
host: Optional[str] = 'tsinghua'
):
if is_module_available('speechcolab'):
from speechcolab.datasets.gigaspeech import GigaSpeech
Expand All @@ -41,10 +43,7 @@ def download_gigaspeech(

for part in dataset_parts:
logging.info(f'Downloading GigaSpeech part: {part}')
try:
gigaspeech.download('{' + part + '}')
except NotImplementedError:
raise ValueError(f"Could not download GigaSpeech part {part} -- speechcolab raised NotImplementedError.")
gigaspeech.download(password, '{' + part + '}', host=host)


def prepare_gigaspeech(
Expand Down Expand Up @@ -93,7 +92,7 @@ def prepare_gigaspeech(
recordings = []
supervisions = []
for recording, segments in tqdm(
ex.map(parse_utterance, gigaspeech.audios('{' + part + '}'), repeat(gigaspeech.root_path)),
ex.map(parse_utterance, gigaspeech.audios('{' + part + '}'), repeat(gigaspeech.gigaspeech_dataset_dir)),
desc='Processing GigaSpeech JSON entries'
):
recordings.append(recording)
Expand Down