Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add the ReazonSpeech recipe #1330

Merged
merged 12 commits into from
May 29, 2024
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
from .nsc import *
from .peoples_speech import *
from .primewords import *
from .reazonspeech import *
from .rir_noise import *
from .slu import *
from .speechcommands import *
Expand Down
15 changes: 15 additions & 0 deletions lhotse/bin/modes/recipes/reazonspeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import click

from lhotse.bin.modes import prepare
from lhotse.recipes.reazonspeech import prepare_reazonspeech
from lhotse.utils import Pathlike

__all__ = ["reazonspeech"]


@prepare.command(context_settings=dict(show_default=True))
@click.argument(
    "corpus_dir",
    # file_okay=False: the recipe requires a directory, so reject a regular
    # file here with a proper usage error instead of letting it reach the
    # directory assertion inside prepare_reazonspeech.
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
)
@click.argument("output_dir", type=click.Path())
def reazonspeech(corpus_dir: Pathlike, output_dir: Pathlike):
    """ReazonSpeech data preparation."""
    # Thin CLI wrapper: all of the work is delegated to the library-level
    # recipe. The return value (the in-memory manifests) is intentionally
    # discarded; output_dir is forwarded to the recipe, which writes the
    # manifest files there.
    prepare_reazonspeech(corpus_dir, output_dir=output_dir)
2 changes: 2 additions & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
from .musan import download_musan, prepare_musan
from .nsc import prepare_nsc
from .peoples_speech import prepare_peoples_speech
from .reazonspeech import prepare_reazonspeech
from .rir_noise import download_rir_noise, prepare_rir_noise
from .slu import prepare_slu
from .speechcommands import download_speechcommands, prepare_speechcommands
Expand Down Expand Up @@ -180,6 +181,7 @@
"prepare_musan",
"prepare_nsc",
"prepare_peoples_speech",
"prepare_reazonspeech",
"download_rir_noise",
"prepare_rir_noise",
"prepare_slu",
Expand Down
69 changes: 69 additions & 0 deletions lhotse/recipes/reazonspeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import json
import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, Optional, Union

from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.qa import fix_manifests
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike


def prepare_reazonspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build recording and supervision manifests for the ReazonSpeech corpus.

    For each of the ``train``, ``valid`` and ``test`` parts this reads
    ``<corpus_dir>/<part>.json``, which is parsed as JSON Lines (one JSON
    object per non-empty line). Every object must provide the keys
    ``"audio_filepath"``, ``"duration"`` and ``"text"``. Each line yields one
    ``Recording`` and one ``SupervisionSegment`` that starts at 0.0 on
    channel 0 and carries the transcript with language ``"Japanese"``.

    Utterance ids are consecutive integers (as strings) from a single counter
    shared by all parts, so ids do not repeat across parts.

    :param corpus_dir: directory containing ``train.json``, ``valid.json``
        and ``test.json``.
    :param output_dir: if not ``None``, the directory (created when missing)
        where ``reazonspeech_recordings_<part>.jsonl.gz`` and
        ``reazonspeech_supervisions_<part>.jsonl.gz`` are written.
    :return: a dict keyed by part name; each value is a dict with the keys
        ``"recordings"`` and ``"supervisions"``.
    :raises AssertionError: if ``corpus_dir`` is not an existing directory.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): every part is accumulated fully in memory before anything
    # is written. ReazonSpeech is a large dataset, so this may need a lot of
    # CPU memory; writing examples incrementally as they are processed (as
    # the GigaSpeech recipe, lhotse/recipes/gigaspeech.py, does) would lower
    # peak usage.
    manifests = defaultdict(dict)

    # One counter for the whole corpus keeps ids unique across all parts.
    idx = 0
    for part in ("train", "valid", "test"):
        recordings = []
        supervisions = []
        # Transcripts are Japanese text: decode explicitly as UTF-8 instead
        # of relying on the platform's locale-dependent default encoding.
        with open(corpus_dir / f"{part}.json", encoding="utf-8") as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                item = json.loads(line)
                utt_id = str(idx)
                recordings.append(
                    Recording.from_file(
                        item["audio_filepath"], recording_id=utt_id
                    )
                )
                # No ``speaker`` is set: the metadata read here carries no
                # speaker information, and a made-up per-utterance speaker id
                # would be misleading downstream.
                supervisions.append(
                    SupervisionSegment(
                        id=utt_id,
                        recording_id=utt_id,
                        start=0.0,
                        duration=item["duration"],
                        channel=0,
                        language="Japanese",
                        text=item["text"],
                    )
                )
                idx += 1

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set, supervision_set = fix_manifests(
            recording_set, supervision_set
        )
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"reazonspeech_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(
                output_dir / f"reazonspeech_recordings_{part}.jsonl.gz"
            )

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests