Add the ReazonSpeech recipe (#1330)

* Add stub ReazonSpeech recipe. I created this recipe by copying the "aishell4" recipe and stripping most of the contents. Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net> * Format the script with black to meet style guidelines * Add ReazonSpeech to the dataset table * Add a download method and refactor the prepare function * Fix the TypeError when downloading the subset * Format to follow the code style * Change to local import * Format to follow the code style --------- Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net> Co-authored-by: Fujimoto Seiji <fujimoto@ceptord.net> Co-authored-by: Chen <qc@KDM00.cm.cluster> Co-authored-by: root <root@KDA01.cm.cluster>
lhotse-speech · May 29, 2024 · c778520 · c778520
1 parent 26c3911
commit c778520
Show file tree

Hide file tree

Showing 5 changed files with 301 additions and 0 deletions.
diff --git a/docs/corpus.rst b/docs/corpus.rst
@@ -165,6 +165,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_nsc`
   * - People's Speech
     - :func:`lhotse.recipes.prepare_peoples_speech`
+  * - ReazonSpeech
+    - :func:`lhotse.recipes.prepare_reazonspeech`
   * - RIRs and Noises Corpus (OpenSLR 28)
     - :func:`lhotse.recipes.prepare_rir_noise`
   * - Speech Commands

diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py
@@ -63,6 +63,7 @@
 from .nsc import *
 from .peoples_speech import *
 from .primewords import *
+from .reazonspeech import *
 from .rir_noise import *
 from .slu import *
 from .speechcommands import *

diff --git a/lhotse/bin/modes/recipes/reazonspeech.py b/lhotse/bin/modes/recipes/reazonspeech.py
@@ -0,0 +1,52 @@
+import logging
+from typing import List
+
+import click
+
+from lhotse.bin.modes import download, prepare
+from lhotse.recipes.reazonspeech import (
+    REAZONSPEECH,
+    download_reazonspeech,
+    prepare_reazonspeech,
+)
+from lhotse.utils import Pathlike
+
+__all__ = ["reazonspeech"]
+
+
+# Sub-command of the `prepare` click group. Click uses the function's docstring as the
+# command's help text, so that docstring is user-facing output.
+@prepare.command(context_settings=dict(show_default=True))
+# `exists=True` makes click reject a corpus_dir that is not present before the function runs.
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+@click.option(
+    "-j",
+    "--num-jobs",
+    type=int,
+    default=1,
+    help="How many threads to use (can give good speed-ups with slow disks).",
+)
+def reazonspeech(
+    corpus_dir: Pathlike,
+    output_dir: Pathlike,
+    num_jobs: int,
+):
+    """ReazonSpeech ASR data preparation."""
+    # Enable INFO logging so the progress messages of the recipe are shown on the console.
+    logging.basicConfig(level=logging.INFO)
+    # All the work is delegated to the library function; the CLI only forwards its arguments.
+    prepare_reazonspeech(corpus_dir, output_dir=output_dir, num_jobs=num_jobs)
+
+
+# Sub-command of the `download` click group. It deliberately reuses the function name of the
+# `prepare` command defined earlier in this module, because click derives the command name
+# from the function name.
+@download.command(context_settings=dict(show_default=True))
+@click.argument("target_dir", type=click.Path())
+@click.option(
+    "--subset",
+    type=click.Choice(("auto",) + REAZONSPEECH),
+    multiple=True,
+    default=["auto"],
+    help="List of dataset parts to download (`auto` selects small-v1). "
+    "To download multiple parts, pass each with `--subset`. "
+    "Example: `--subset all`",
+)
+def reazonspeech(target_dir: Pathlike, subset: List[str]):
+    """ReazonSpeech download."""
+    logging.basicConfig(level=logging.INFO)
+    # With `multiple=True` click hands over a sequence of choices. If `auto` is among them it
+    # takes precedence and is forwarded as the plain string the library function expects.
+    if "auto" in subset:
+        subset = "auto"
+    download_reazonspeech(target_dir, dataset_parts=subset)
diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py
@@ -64,6 +64,7 @@
 from .musan import download_musan, prepare_musan
 from .nsc import prepare_nsc
 from .peoples_speech import prepare_peoples_speech
+from .reazonspeech import download_reazonspeech, prepare_reazonspeech
 from .rir_noise import download_rir_noise, prepare_rir_noise
 from .slu import prepare_slu
 from .speechcommands import download_speechcommands, prepare_speechcommands
@@ -180,6 +181,8 @@
     "prepare_musan",
     "prepare_nsc",
     "prepare_peoples_speech",
+    "download_reazonspeech",
+    "prepare_reazonspeech",
     "download_rir_noise",
     "prepare_rir_noise",
     "prepare_slu",

diff --git a/lhotse/recipes/reazonspeech.py b/lhotse/recipes/reazonspeech.py
@@ -0,0 +1,243 @@
+"""
+ReazonSpeech is an open-source dataset that contains a diverse set of natural Japanese speech,
+collected from terrestrial television streams. It contains more than 35,000 hours of audio.
+
+The dataset is available on Hugging Face. For more details, please visit:
+
+Dataset: https://huggingface.co/datasets/reazon-research/reazonspeech
+Paper: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf
+"""
+
+import json
+import logging
+import re
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, Optional, Sequence, Tuple, Union
+
+from tqdm.auto import tqdm
+
+from lhotse import CutSet, fix_manifests, validate_recordings_and_supervisions
+from lhotse.audio import Recording, RecordingSet
+from lhotse.parallel import parallel_map
+from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
+from lhotse.supervision import SupervisionSegment, SupervisionSet
+from lhotse.utils import Pathlike, is_module_available
+
+# Names of the dataset parts known to this recipe.
+REAZONSPEECH = (
+    "tiny",
+    "small",
+    "medium",
+    "large",
+    "all",
+    "small-v1",
+    "medium-v1",
+    "all-v1",
+)
+
+# Table for `str.translate` that maps every listed punctuation mark to the empty string,
+# i.e. removes it from the text.
+PUNCTUATIONS = {ord(x): "" for x in "、。「」『』，,？！!!?!?"}
+# Full-width Latin letters and digits; the character at each index corresponds to the
+# ASCII character at the same index in HANKAKU.
+ZENKAKU = "ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ０１２３４５６７８９"
+HANKAKU = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
+# Table for `str.translate` that converts each ZENKAKU character into its HANKAKU counterpart.
+ZEN2HAN = str.maketrans(ZENKAKU, HANKAKU)
+
+
+def normalize(s):
+    """
+    Normalize a transcript: drop punctuation, turn full-width letters and digits into
+    half-width ones, and spell out numbers as Japanese words.
+    :param s: str, the text to normalize.
+    :return: str, the normalized text.
+    :raises ImportError: when the optional ``num2words`` package is not installed.
+    """
+    if not is_module_available("num2words"):
+        raise ImportError(
+            "To process the ReazonSpeech corpus, please install optional dependency: pip install num2words"
+        )
+    import num2words
+
+    def spell_out(match):
+        # The matched digits are handed over as text, exactly as they appear in the transcript.
+        return num2words.num2words(match.group(0), lang="ja")
+
+    # Punctuation is removed first, then full-width characters are folded, so that the
+    # number pattern below only ever sees half-width digits.
+    cleaned = s.translate(PUNCTUATIONS).translate(ZEN2HAN)
+    return re.sub(r"\d+\.?\d*", spell_out, cleaned)
+
+
+def write_to_json(data, filename):
+    """
+    Serialize ``data`` as JSON into ``filename``, replacing any previous content.
+    The file is UTF-8 encoded, indented by four spaces, and non-ASCII characters
+    are written as-is instead of being escaped.
+    :param data: The JSON-serializable object to store.
+    :param filename: The path of the file to write.
+    """
+    with open(filename, "w", encoding="utf-8") as stream:
+        json.dump(data, stream, indent=4, ensure_ascii=False)
+
+
+def download_reazonspeech(
+    target_dir: Pathlike = ".",
+    dataset_parts: Optional[Union[str, Sequence[str]]] = "auto",
+) -> Path:
+    """
+    Download the ReazonSpeech dataset and export its metadata to ``dataset.json``.
+
+    Every requested part is fetched with ``datasets.load_dataset`` (using
+    ``<target_dir>/ReazonSpeech`` as the cache directory) and the entries of the ``train``
+    split of each part are appended, in the order the parts were given, to a single
+    ``dataset.json`` file stored in that same directory. Each entry holds a running ``id``,
+    the ``audio_filepath``, the normalized ``text`` and the ``duration`` in seconds.
+
+    NOTE: entries of all requested parts are concatenated as-is; if the requested parts
+    overlap, the shared utterances will presumably be exported more than once.
+
+    :param target_dir: Pathlike, the path of the dir to store the dataset.
+    :param dataset_parts: the parts of the dataset to download (e.g. small, medium, or large).
+        ``"auto"`` or ``None`` selects ``small-v1``.
+    :return: the path to downloaded data and the JSON file.
+    :raises ImportError: when the optional ``datasets`` package is not installed.
+    """
+    if is_module_available("datasets"):
+        from datasets import load_dataset
+    else:
+        raise ImportError(
+            "To process the ReazonSpeech corpus, please install optional dependency: pip install datasets"
+        )
+    target_dir = Path(target_dir)
+    target_dir.mkdir(parents=True, exist_ok=True)
+    corpus_dir = target_dir / "ReazonSpeech"
+    # Make sure the directory exists even when no part ends up being downloaded,
+    # so that writing dataset.json below cannot fail on a missing directory.
+    corpus_dir.mkdir(parents=True, exist_ok=True)
+
+    if dataset_parts is None or dataset_parts == "auto":
+        dataset_parts = ("small-v1",)
+    elif isinstance(dataset_parts, str):
+        dataset_parts = [dataset_parts]
+
+    # Collect the records of *every* requested part; exporting after the loop would only
+    # see the part that was downloaded last.
+    data_for_json = []
+    for part in dataset_parts:
+        logging.info(f"Downloading ReazonSpeech part: {part}")
+        ds = load_dataset(
+            "reazon-research/reazonspeech",
+            part,
+            trust_remote_code=True,
+            cache_dir=corpus_dir,
+        )["train"]
+
+        for item in ds:
+            audio = item["audio"]
+            # The duration is derived from the decoded samples and their sampling rate.
+            duration = len(audio["array"]) / float(audio["sampling_rate"])
+            data_for_json.append(
+                {
+                    # Running index over all parts, so ids stay unique in the merged file.
+                    "id": str(len(data_for_json)),
+                    "audio_filepath": audio["path"],
+                    "text": normalize(item["transcription"]),
+                    "duration": duration,
+                }
+            )
+
+    # Write data to a JSON file
+    write_to_json(data_for_json, corpus_dir / "dataset.json")
+
+    return corpus_dir
+
+
+def prepare_reazonspeech(
+    corpus_dir: Pathlike,
+    output_dir: Optional[Pathlike],
+    num_jobs: int = 1,
+) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
+    """
+    Create the train/dev/test manifests of ReazonSpeech from ``dataset.json`` in ``corpus_dir``.
+
+    The first 1000 entries of ``dataset.json`` become ``dev``, the following 100 ``test`` and
+    all remaining ones ``train``; the three lists are also stored as ``train.json``,
+    ``dev.json`` and ``test.json`` inside ``corpus_dir``. A split whose manifests are already
+    found in ``output_dir`` is not processed again.
+    :param corpus_dir: Pathlike, the path of the data dir (must contain ``dataset.json``).
+    :param output_dir: Pathlike, the path where to write the manifests.
+    :param num_jobs: int, forwarded to ``parallel_map`` for the 'parse_utterance' calls.
+    :return: a Dict whose key is the split name; for every split built by this call the value
+        is a Dict with the keys 'recordings', 'supervisions' and 'cuts'.
+    """
+    corpus_dir = Path(corpus_dir)
+    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
+
+    # Carve the exported entries into the three splits and store each one next to dataset.json.
+    with open(corpus_dir / "dataset.json", "r", encoding="utf-8") as src:
+        entries = json.load(src)
+    splits = {
+        "train": entries[1100:],
+        "dev": entries[:1000],
+        "test": entries[1000:1100],
+    }
+    for split_name, split_entries in splits.items():
+        write_to_json(split_entries, corpus_dir / f"{split_name}.json")
+    parts = tuple(splits)
+
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # Start from whatever manifests a previous run left behind in output_dir.
+    manifests = read_manifests_if_cached(
+        dataset_parts=parts,
+        output_dir=output_dir,
+        prefix="reazonspeech",
+        suffix="jsonl.gz",
+        lazy=True,
+    )
+
+    for part in parts:
+        logging.info(f"Processing ReazonSpeech subset: {part}")
+        already_prepared = manifests_exist(
+            part=part, output_dir=output_dir, prefix="reazonspeech", suffix="jsonl.gz"
+        )
+        if already_prepared:
+            logging.info(f"ReazonSpeech subset: {part} already prepared - skipping.")
+            continue
+
+        with open(corpus_dir / f"{part}.json", "r", encoding="utf-8") as src:
+            items = json.load(src)
+
+        recordings_path = output_dir / f"reazonspeech_recordings_{part}.jsonl.gz"
+        supervisions_path = output_dir / f"reazonspeech_supervisions_{part}.jsonl.gz"
+        cuts_path = output_dir / f"reazonspeech_cuts_{part}.jsonl.gz"
+
+        with RecordingSet.open_writer(
+            recordings_path
+        ) as rec_writer, SupervisionSet.open_writer(
+            supervisions_path
+        ) as sup_writer, CutSet.open_writer(
+            cuts_path
+        ) as cut_writer:
+            parsed = parallel_map(
+                parse_utterance,
+                items,
+                num_jobs=num_jobs,
+            )
+            for recording, segment in tqdm(
+                parsed,
+                desc="Processing reazonspeech JSON entries",
+            ):
+                # Each utterance is fixed and validated on its own, as a one-element set.
+                single_recording = RecordingSet.from_recordings([recording])
+                single_supervision = SupervisionSet.from_segments([segment])
+                recordings, segments = fix_manifests(
+                    recordings=single_recording,
+                    supervisions=single_supervision,
+                )
+                validate_recordings_and_supervisions(
+                    recordings=recordings, supervisions=segments
+                )
+                # One recording in, one cut out; it is written alongside the other manifests.
+                cuts = CutSet.from_manifests(
+                    recordings=recordings, supervisions=segments
+                )
+                rec_writer.write(recordings[0])
+                sup_writer.write(segments[0])
+                cut_writer.write(cuts[0])
+
+        # The writers are closed at this point; re-open what they wrote as lazy manifests.
+        manifests[part] = {
+            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
+            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
+            "cuts": CutSet.from_jsonl_lazy(cut_writer.path),
+        }
+
+    return dict(manifests)
+
+
+def parse_utterance(item: Any) -> Optional[Tuple[Recording, SupervisionSegment]]:
+    """
+    Turn one entry of the exported ReazonSpeech JSON into lhotse manifests.
+    :param item: a mapping with the keys ``audio_filepath``, ``id``, ``duration`` and ``text``.
+    :return: A tuple containing the Recording read from the audio file and a
+        SupervisionSegment that starts at 0.0 and carries the entry's duration and text.
+    """
+    audio_path = item["audio_filepath"]
+    utterance_id = item["id"]
+    recording = Recording.from_file(audio_path, recording_id=utterance_id)
+    # The entry's id doubles as both the recording id and the supervision id.
+    supervision = SupervisionSegment(
+        id=utterance_id,
+        recording_id=utterance_id,
+        start=0.0,
+        duration=item["duration"],
+        channel=0,
+        language="Japanese",
+        text=item["text"],
+    )
+    return recording, supervision