Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the ReazonSpeech recipe #1330

Merged
merged 12 commits
May 29, 2024
2 changes: 2 additions & 0 deletions docs/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_nsc`
* - People's Speech
- :func:`lhotse.recipes.prepare_peoples_speech`
* - ReazonSpeech
- :func:`lhotse.recipes.prepare_reazonspeech`
* - RIRs and Noises Corpus (OpenSLR 28)
- :func:`lhotse.recipes.prepare_rir_noise`
* - Speech Commands
Expand Down
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
from .nsc import *
from .peoples_speech import *
from .primewords import *
from .reazonspeech import *
from .rir_noise import *
from .slu import *
from .speechcommands import *
Expand Down
52 changes: 52 additions & 0 deletions lhotse/bin/modes/recipes/reazonspeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import logging
from typing import List

import click

from lhotse.bin.modes import download, prepare
from lhotse.recipes.reazonspeech import (
REAZONSPEECH,
download_reazonspeech,
prepare_reazonspeech,
)
from lhotse.utils import Pathlike

__all__ = ["reazonspeech"]


@prepare.command(context_settings={"show_default": True})
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "-j",
    "--num-jobs",
    type=int,
    default=1,
    help="How many threads to use (can give good speed-ups with slow disks).",
)
def reazonspeech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    num_jobs: int,
):
    """ReazonSpeech ASR data preparation."""
    logging.basicConfig(level=logging.INFO)
    # All the real work happens in the recipe module; this wrapper only
    # parses CLI arguments and forwards them.
    prepare_reazonspeech(
        corpus_dir=corpus_dir,
        output_dir=output_dir,
        num_jobs=num_jobs,
    )


@download.command(context_settings=dict(show_default=True))
@click.argument("target_dir", type=click.Path())
@click.option(
    "--subset",
    type=click.Choice(("auto",) + REAZONSPEECH),
    multiple=True,
    default=["auto"],
    help="List of dataset parts to download (default: auto, which selects small-v1). "
    "To download multiple parts, pass `--subset` once per part. "
    "Example: `--subset small --subset medium`",
)
def reazonspeech(target_dir: Pathlike, subset: List[str]):
    """ReazonSpeech download."""
    logging.basicConfig(level=logging.INFO)
    # "auto" overrides any other requested subsets and lets the recipe
    # pick its default part.
    if "auto" in subset:
        subset = "auto"
    download_reazonspeech(target_dir, dataset_parts=subset)
3 changes: 3 additions & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
from .musan import download_musan, prepare_musan
from .nsc import prepare_nsc
from .peoples_speech import prepare_peoples_speech
from .reazonspeech import download_reazonspeech, prepare_reazonspeech
from .rir_noise import download_rir_noise, prepare_rir_noise
from .slu import prepare_slu
from .speechcommands import download_speechcommands, prepare_speechcommands
Expand Down Expand Up @@ -180,6 +181,8 @@
"prepare_musan",
"prepare_nsc",
"prepare_peoples_speech",
"download_reazonspeech",
"prepare_reazonspeech",
"download_rir_noise",
"prepare_rir_noise",
"prepare_slu",
Expand Down
243 changes: 243 additions & 0 deletions lhotse/recipes/reazonspeech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
"""
ReazonSpeech is an open-source dataset that contains a diverse set of natural Japanese speech,
collected from terrestrial television streams. It contains more than 35,000 hours of audio.

The dataset is available on Hugging Face. For more details, please visit:

Dataset: https://huggingface.co/datasets/reazon-research/reazonspeech
Paper: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf
"""

import json
import logging
import re
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, Optional, Sequence, Tuple, Union

from tqdm.auto import tqdm

from lhotse import CutSet, fix_manifests, validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.parallel import parallel_map
from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, is_module_available

# Subsets of ReazonSpeech that can be requested for download.
REAZONSPEECH = (
    "tiny",
    "small",
    "medium",
    "large",
    "all",
    "small-v1",
    "medium-v1",
    "all-v1",
)

# Translation table that deletes common Japanese punctuation marks.
PUNCTUATIONS = {ord(x): "" for x in "、。「」『』,,?!!!?!?"}
# Full-width (zenkaku) alphanumerics and their half-width (hankaku)
# counterparts.  ZENKAKU must contain the full-width code points
# (U+FF10..U+FF5A range); if both strings were plain ASCII the
# translation table below would be a no-op.
ZENKAKU = "ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ０１２３４５６７８９"
HANKAKU = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
# Maps every full-width alphanumeric to its half-width equivalent.
ZEN2HAN = str.maketrans(ZENKAKU, HANKAKU)


def normalize(s):
    """
    Normalize a transcription string: remove punctuation, convert
    full-width alphanumerics to half-width, and spell out digit
    sequences as Japanese number words.
    :param s: str, input string.
    :return: str, normalized string.
    """
    # num2words is an optional dependency, so fail with a clear hint.
    if not is_module_available("num2words"):
        raise ImportError(
            "To process the ReazonSpeech corpus, please install optional dependency: pip install num2words"
        )
    import num2words

    cleaned = s.translate(PUNCTUATIONS).translate(ZEN2HAN)

    def spell_out(match):
        # Render a matched number (possibly with a decimal part) in Japanese.
        return num2words.num2words(match.group(0), lang="ja")

    return re.sub(r"\d+\.?\d*", spell_out, cleaned)


def write_to_json(data, filename):
    """
    Serialize ``data`` as pretty-printed, UTF-8 JSON and save it to ``filename``.
    :param data: The data to write.
    :param filename: The name of the file to write to.
    """
    # ensure_ascii=False keeps Japanese text human-readable in the file.
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(serialized)


def download_reazonspeech(
    target_dir: Pathlike = ".",
    dataset_parts: Optional[Union[str, Sequence[str]]] = "auto",
) -> Path:
    """
    Download the ReazonSpeech dataset from Hugging Face and write a single
    ``dataset.json`` manifest describing every downloaded utterance.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
        Data is cached under ``<target_dir>/ReazonSpeech``.
    :param dataset_parts: the parts of the dataset to download
        (e.g. "small", "medium", or "large"). "auto" selects "small-v1".
    :return: the path to the corpus dir containing the JSON manifest.
    """
    # The `datasets` library is an optional dependency; fail with a clear hint.
    if not is_module_available("datasets"):
        raise ImportError(
            "To process the ReazonSpeech corpus, please install optional dependency: pip install datasets"
        )
    from datasets import load_dataset

    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    corpus_dir = target_dir / "ReazonSpeech"

    if dataset_parts == "auto":
        dataset_parts = ("small-v1",)
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    # Accumulate records across ALL requested parts; previously each part
    # rebuilt the list and overwrote dataset.json, so only the last part
    # survived when several parts were requested.
    records = []
    for part in dataset_parts:
        logging.info(f"Downloading ReazonSpeech part: {part}")
        ds = load_dataset(
            "reazon-research/reazonspeech",
            part,
            trust_remote_code=True,
            cache_dir=corpus_dir,
        )["train"]

        for item in ds:
            # Duration in seconds, derived from the decoded waveform length.
            audio = item["audio"]
            duration = len(audio["array"]) / float(audio["sampling_rate"])

            records.append(
                {
                    # Ids stay unique across parts because the list keeps growing.
                    "id": str(len(records)),
                    "audio_filepath": audio["path"],
                    "text": normalize(item["transcription"]),
                    "duration": duration,
                }
            )

    # Write a single manifest covering all requested parts.
    write_to_json(records, corpus_dir / "dataset.json")

    return corpus_dir


def prepare_reazonspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike],
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir
        (must contain the ``dataset.json`` produced by :func:`download_reazonspeech`).
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
    :return: a Dict whose key is the dataset part ('train', 'dev', 'test'), and the
        value is a Dict with the keys 'recordings', 'supervisions' and 'cuts'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    # Split the dataset into train, dev, and test:
    # first 1000 records -> dev, next 100 -> test, the remainder -> train.
    with open(corpus_dir / "dataset.json", "r", encoding="utf-8") as file:
        full = json.load(file)
    dev = full[:1000]
    test = full[1000:1100]
    train = full[1100:]

    write_to_json(train, corpus_dir / "train.json")
    write_to_json(dev, corpus_dir / "dev.json")
    write_to_json(test, corpus_dir / "test.json")

    parts = ("train", "dev", "test")

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe some manifests already exist: we can read them and save a bit of preparation time.
    manifests = read_manifests_if_cached(
        dataset_parts=parts,
        output_dir=output_dir,
        prefix="reazonspeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in parts:
        logging.info(f"Processing ReazonSpeech subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="reazonspeech", suffix="jsonl.gz"
        ):
            logging.info(f"ReazonSpeech subset: {part} already prepared - skipping.")
            continue

        filename = corpus_dir / f"{part}.json"
        with open(filename, "r", encoding="utf-8") as file:
            items = json.load(file)

        # Stream each example to disk as soon as it is processed, so peak
        # memory stays low even for very large subsets.
        with RecordingSet.open_writer(
            output_dir / f"reazonspeech_recordings_{part}.jsonl.gz"
        ) as rec_writer, SupervisionSet.open_writer(
            output_dir / f"reazonspeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, CutSet.open_writer(
            output_dir / f"reazonspeech_cuts_{part}.jsonl.gz"
        ) as cut_writer:
            for recording, segment in tqdm(
                parallel_map(
                    parse_utterance,
                    items,
                    num_jobs=num_jobs,
                ),
                desc="Processing reazonspeech JSON entries",
            ):
                # Fix and validate the recording + supervisions.
                recordings, segments = fix_manifests(
                    recordings=RecordingSet.from_recordings([recording]),
                    supervisions=SupervisionSet.from_segments([segment]),
                )
                validate_recordings_and_supervisions(
                    recordings=recordings, supervisions=segments
                )
                # Create the cut since most users will need it anyway.
                # There will be exactly one cut since there's exactly one recording.
                cuts = CutSet.from_manifests(
                    recordings=recordings, supervisions=segments
                )
                # Write the manifests.
                rec_writer.write(recordings[0])
                sup_writer.write(segments[0])
                cut_writer.write(cuts[0])

        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
            "cuts": CutSet.from_jsonl_lazy(cut_writer.path),
        }

    return dict(manifests)


def parse_utterance(item: Any) -> Optional[Tuple[Recording, SupervisionSegment]]:
    """
    Build the Lhotse manifests for one ReazonSpeech utterance record.
    :param item: The utterance to process (a dict with "id", "audio_filepath",
        "text" and "duration" keys).
    :return: A tuple containing the Recording and SupervisionSegment.
    """
    utt_id = item["id"]
    recording = Recording.from_file(item["audio_filepath"], recording_id=utt_id)
    # A single full-length supervision covering the whole recording.
    supervision = SupervisionSegment(
        id=utt_id,
        recording_id=utt_id,
        start=0.0,
        duration=item["duration"],
        channel=0,
        language="Japanese",
        text=item["text"],
    )
    return recording, supervision
Loading