Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the ReazonSpeech recipe #1330

Merged
merged 12 commits
May 29, 2024
2 changes: 2 additions & 0 deletions docs/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_nsc`
* - People's Speech
- :func:`lhotse.recipes.prepare_peoples_speech`
* - ReazonSpeech
- :func:`lhotse.recipes.prepare_reazonspeech`
* - RIRs and Noises Corpus (OpenSLR 28)
- :func:`lhotse.recipes.prepare_rir_noise`
* - Speech Commands
Expand Down
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
from .nsc import *
from .peoples_speech import *
from .primewords import *
from .reazonspeech import *
from .rir_noise import *
from .slu import *
from .speechcommands import *
Expand Down
52 changes: 52 additions & 0 deletions lhotse/bin/modes/recipes/reazonspeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import logging
from typing import List

import click

from lhotse.bin.modes import download, prepare
from lhotse.recipes.reazonspeech import (
REAZONSPEECH,
download_reazonspeech,
prepare_reazonspeech,
)
from lhotse.utils import Pathlike

__all__ = ["reazonspeech"]


@prepare.command(context_settings={"show_default": True})
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "-j",
    "--num-jobs",
    type=int,
    default=1,
    help="How many threads to use (can give good speed-ups with slow disks).",
)
def reazonspeech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    num_jobs: int,
):
    """ReazonSpeech ASR data preparation."""
    logging.basicConfig(level=logging.INFO)
    # All the real work happens in the recipe module; this wrapper only
    # parses CLI arguments and forwards them.
    prepare_reazonspeech(
        corpus_dir=corpus_dir,
        output_dir=output_dir,
        num_jobs=num_jobs,
    )


@download.command(context_settings=dict(show_default=True))
@click.argument("target_dir", type=click.Path())
@click.option(
    "--subset",
    type=click.Choice(("auto",) + REAZONSPEECH),
    multiple=True,
    default=["auto"],
    help="List of dataset parts to download (default: auto, which selects small-v1). "
    "To download multiple parts, pass `--subset` once per part. "
    "Example: `--subset small --subset medium`",
)
def reazonspeech(target_dir: Pathlike, subset: List[str]):
    """ReazonSpeech download."""
    logging.basicConfig(level=logging.INFO)
    # "auto" overrides any other requested subsets and lets the recipe
    # pick its default part.
    if "auto" in subset:
        subset = "auto"
    download_reazonspeech(target_dir, dataset_parts=subset)
3 changes: 3 additions & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
from .musan import download_musan, prepare_musan
from .nsc import prepare_nsc
from .peoples_speech import prepare_peoples_speech
from .reazonspeech import download_reazonspeech, prepare_reazonspeech
from .rir_noise import download_rir_noise, prepare_rir_noise
from .slu import prepare_slu
from .speechcommands import download_speechcommands, prepare_speechcommands
Expand Down Expand Up @@ -180,6 +181,8 @@
"prepare_musan",
"prepare_nsc",
"prepare_peoples_speech",
"download_reazonspeech",
"prepare_reazonspeech",
"download_rir_noise",
"prepare_rir_noise",
"prepare_slu",
Expand Down
243 changes: 243 additions & 0 deletions lhotse/recipes/reazonspeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
"""
ReazonSpeech is an open-source dataset that contains a diverse set of natural Japanese speech,
collected from terrestrial television streams. It contains more than 35,000 hours of audio.

The dataset is available on Hugging Face. For more details, please visit:

Dataset: https://huggingface.co/datasets/reazon-research/reazonspeech
Paper: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf
"""

import json
import logging
import re
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, Optional, Sequence, Tuple, Union

from tqdm.auto import tqdm

from lhotse import CutSet, fix_manifests, validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.parallel import parallel_map
from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, is_module_available

# Subsets of ReazonSpeech that can be requested for download.
REAZONSPEECH = (
    "tiny",
    "small",
    "medium",
    "large",
    "all",
    "small-v1",
    "medium-v1",
    "all-v1",
)

# Translation table that deletes common Japanese punctuation marks.
PUNCTUATIONS = {ord(x): "" for x in "、。「」『』,,?!!!?!?"}
# Full-width (zenkaku) alphanumerics and their half-width (hankaku)
# counterparts.  ZENKAKU must contain the full-width code points
# (U+FF10..U+FF5A range); if both strings were plain ASCII the
# translation table below would be a no-op.
ZENKAKU = "ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ０１２３４５６７８９"
HANKAKU = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
# Maps every full-width alphanumeric to its half-width equivalent.
ZEN2HAN = str.maketrans(ZENKAKU, HANKAKU)


def normalize(s):
    """
    Normalize a transcription string: remove punctuation, convert
    full-width alphanumerics to half-width, and spell out digit
    sequences as Japanese number words.
    :param s: str, input string.
    :return: str, normalized string.
    """
    # num2words is an optional dependency, so fail with a clear hint.
    if not is_module_available("num2words"):
        raise ImportError(
            "To process the ReazonSpeech corpus, please install optional dependency: pip install num2words"
        )
    import num2words

    cleaned = s.translate(PUNCTUATIONS).translate(ZEN2HAN)

    def spell_out(match):
        # Render a matched number (possibly with a decimal part) in Japanese.
        return num2words.num2words(match.group(0), lang="ja")

    return re.sub(r"\d+\.?\d*", spell_out, cleaned)


def write_to_json(data, filename):
    """
    Serialize ``data`` as pretty-printed, UTF-8 JSON and save it to ``filename``.
    :param data: The data to write.
    :param filename: The name of the file to write to.
    """
    # ensure_ascii=False keeps Japanese text human-readable in the file.
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(serialized)


def download_reazonspeech(
    target_dir: Pathlike = ".",
    dataset_parts: Optional[Union[str, Sequence[str]]] = "auto",
) -> Path:
    """
    Download the ReazonSpeech dataset from Hugging Face and write a single
    ``dataset.json`` manifest describing every downloaded utterance.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
        Data is cached under ``<target_dir>/ReazonSpeech``.
    :param dataset_parts: the parts of the dataset to download
        (e.g. "small", "medium", or "large"). "auto" selects "small-v1".
    :return: the path to the corpus dir containing the JSON manifest.
    """
    # The `datasets` library is an optional dependency; fail with a clear hint.
    if not is_module_available("datasets"):
        raise ImportError(
            "To process the ReazonSpeech corpus, please install optional dependency: pip install datasets"
        )
    from datasets import load_dataset

    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    corpus_dir = target_dir / "ReazonSpeech"

    if dataset_parts == "auto":
        dataset_parts = ("small-v1",)
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    # Accumulate records across ALL requested parts; previously each part
    # rebuilt the list and overwrote dataset.json, so only the last part
    # survived when several parts were requested.
    records = []
    for part in dataset_parts:
        logging.info(f"Downloading ReazonSpeech part: {part}")
        ds = load_dataset(
            "reazon-research/reazonspeech",
            part,
            trust_remote_code=True,
            cache_dir=corpus_dir,
        )["train"]

        for item in ds:
            # Duration in seconds, derived from the decoded waveform length.
            audio = item["audio"]
            duration = len(audio["array"]) / float(audio["sampling_rate"])

            records.append(
                {
                    # Ids stay unique across parts because the list keeps growing.
                    "id": str(len(records)),
                    "audio_filepath": audio["path"],
                    "text": normalize(item["transcription"]),
                    "duration": duration,
                }
            )

    # Write a single manifest covering all requested parts.
    write_to_json(records, corpus_dir / "dataset.json")

    return corpus_dir


def prepare_reazonspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike],
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir
        (must contain the ``dataset.json`` produced by :func:`download_reazonspeech`).
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
    :return: a Dict whose key is the dataset part ('train', 'dev', 'test'), and the
        value is a Dict with the keys 'recordings', 'supervisions' and 'cuts'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    # Split the dataset into train, dev, and test:
    # first 1000 records -> dev, next 100 -> test, the remainder -> train.
    with open(corpus_dir / "dataset.json", "r", encoding="utf-8") as file:
        full = json.load(file)
    dev = full[:1000]
    test = full[1000:1100]
    train = full[1100:]

    write_to_json(train, corpus_dir / "train.json")
    write_to_json(dev, corpus_dir / "dev.json")
    write_to_json(test, corpus_dir / "test.json")

    parts = ("train", "dev", "test")

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe some manifests already exist: we can read them and save a bit of preparation time.
    manifests = read_manifests_if_cached(
        dataset_parts=parts,
        output_dir=output_dir,
        prefix="reazonspeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in parts:
        logging.info(f"Processing ReazonSpeech subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="reazonspeech", suffix="jsonl.gz"
        ):
            logging.info(f"ReazonSpeech subset: {part} already prepared - skipping.")
            continue

        filename = corpus_dir / f"{part}.json"
        with open(filename, "r", encoding="utf-8") as file:
            items = json.load(file)

        # Stream each example to disk as soon as it is processed, so peak
        # memory stays low even for very large subsets.
        with RecordingSet.open_writer(
            output_dir / f"reazonspeech_recordings_{part}.jsonl.gz"
        ) as rec_writer, SupervisionSet.open_writer(
            output_dir / f"reazonspeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, CutSet.open_writer(
            output_dir / f"reazonspeech_cuts_{part}.jsonl.gz"
        ) as cut_writer:
            for recording, segment in tqdm(
                parallel_map(
                    parse_utterance,
                    items,
                    num_jobs=num_jobs,
                ),
                desc="Processing reazonspeech JSON entries",
            ):
                # Fix and validate the recording + supervisions.
                recordings, segments = fix_manifests(
                    recordings=RecordingSet.from_recordings([recording]),
                    supervisions=SupervisionSet.from_segments([segment]),
                )
                validate_recordings_and_supervisions(
                    recordings=recordings, supervisions=segments
                )
                # Create the cut since most users will need it anyway.
                # There will be exactly one cut since there's exactly one recording.
                cuts = CutSet.from_manifests(
                    recordings=recordings, supervisions=segments
                )
                # Write the manifests.
                rec_writer.write(recordings[0])
                sup_writer.write(segments[0])
                cut_writer.write(cuts[0])

        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
            "cuts": CutSet.from_jsonl_lazy(cut_writer.path),
        }

    return dict(manifests)


def parse_utterance(item: Any) -> Optional[Tuple[Recording, SupervisionSegment]]:
    """
    Build the Lhotse manifests for one ReazonSpeech utterance record.
    :param item: The utterance to process (a dict with "id", "audio_filepath",
        "text" and "duration" keys).
    :return: A tuple containing the Recording and SupervisionSegment.
    """
    utt_id = item["id"]
    recording = Recording.from_file(item["audio_filepath"], recording_id=utt_id)
    # A single full-length supervision covering the whole recording.
    supervision = SupervisionSegment(
        id=utt_id,
        recording_id=utt_id,
        start=0.0,
        duration=item["duration"],
        channel=0,
        language="Japanese",
        text=item["text"],
    )
    return recording, supervision
Loading