lhotse-speech · pzelasko · Nov 26, 2024 · Nov 23, 2024 · Nov 23, 2024 · Nov 23, 2024
diff --git a/docs/corpus.rst b/docs/corpus.rst
@@ -75,6 +75,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_callhome_egyptian`
   * - CallHome English
     - :func:`lhotse.recipes.prepare_callhome_english`
+  * - Chinese Dysarthric Speech Database
+    - :func:`lhotse.recipes.prepare_cdsd`
   * - CHiME-6
     - :func:`lhotse.recipes.prepare_chime6`
   * - CMU Arctic

diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py
@@ -17,6 +17,7 @@
 from .bvcc import *
 from .callhome_egyptian import *
 from .callhome_english import *
+from .cdsd import *
 from .chime6 import *
 from .cmu_arctic import *
 from .cmu_indic import *

diff --git a/lhotse/bin/modes/recipes/cdsd.py b/lhotse/bin/modes/recipes/cdsd.py
@@ -0,0 +1,15 @@
+import click
+
+from lhotse.bin.modes import prepare
+from lhotse.recipes.cdsd import prepare_cdsd
+from lhotse.utils import Pathlike
+
+__all__ = ["cdsd"]
+
+
+@prepare.command(context_settings=dict(show_default=True))
+# file_okay=False: prepare_cdsd() asserts that corpus_dir is a directory, so a
+# plain file is rejected while parsing the arguments instead of tripping that
+# assertion later.
+@click.argument(
+    "corpus_dir", type=click.Path(exists=True, file_okay=False, dir_okay=True)
+)
+# output_dir does not have to exist yet: prepare_cdsd() creates it.
+@click.argument("output_dir", type=click.Path())
+def cdsd(corpus_dir: Pathlike, output_dir: Pathlike):
+    """CDSD ASR data preparation."""
+    # The manifests returned by prepare_cdsd() are not used here; the call is
+    # made for its side effect of writing them into output_dir.
+    prepare_cdsd(corpus_dir, output_dir=output_dir)
diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py
@@ -14,6 +14,7 @@
 from .bvcc import download_bvcc, prepare_bvcc
 from .callhome_egyptian import prepare_callhome_egyptian
 from .callhome_english import prepare_callhome_english
+from .cdsd import prepare_cdsd
 from .chime6 import download_chime6, prepare_chime6
 from .cmu_arctic import download_cmu_arctic, prepare_cmu_arctic
 from .cmu_indic import download_cmu_indic, prepare_cmu_indic

diff --git a/lhotse/recipes/cdsd.py b/lhotse/recipes/cdsd.py
@@ -0,0 +1,116 @@
+"""
+About the CDSD (Chinese Dysarthric Speech Database) dataset:
+
+    This database comprises speech data from 24 participants with dysarthria. 
+
+    Among these participants, one recorded an additional 10 hours of speech data, while each of the 24 recorded one hour, resulting in 34 hours of speech material.
+
+    To accommodate participants with varying cognitive levels, the text pool primarily consists of content from the AISHELL-1 dataset and speeches by primary and secondary school students. When participants read these texts, they must use a mobile device or the ZOOM F8n multi-track field recorder to record their speeches. 
+
+    In this paper, the authors elucidate the data collection and annotation processes and present an approach for establishing a baseline for dysarthric speech recognition. Furthermore, the authors conducted a speaker-dependent dysarthric speech recognition experiment using an additional 10 hours of speech data from one of the participants. 
+
+arXiv link: https://arxiv.org/abs/2310.15930v1
+"""
+
+import logging
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, Optional, Union
+
+from tqdm.auto import tqdm
+
+from lhotse import validate_recordings_and_supervisions
+from lhotse.audio import Recording, RecordingSet
+from lhotse.qa import fix_manifests
+from lhotse.supervision import SupervisionSegment, SupervisionSet
+from lhotse.utils import Pathlike
+
+
+# Translation table for the five full-width Latin letters handled by the
+# reference recipe (see the text_normalize docstring).
+_FULLWIDTH_TO_ASCII = str.maketrans("ａｂｃｋｔ", "abckt")
+
+
+def text_normalize(line: str) -> str:
+    """
+    Replace the full-width letters ａ, ｂ, ｃ, ｋ, ｔ with ASCII and upper-case the text.
+
+    Modified from https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/aishell_data_prep.sh#L54
+    which applies the same five substitutions with a chain of ``sed 's/ａ/a/g'``
+    style commands.
+
+    :param line: str, the transcript text to normalize.
+    :return: str, the text with the five letters replaced and then passed
+        through ``str.upper()``; all other characters are only affected by
+        the upper-casing.
+    """
+    # One pass over the string instead of five chained ``replace`` calls.
+    return line.translate(_FULLWIDTH_TO_ASCII).upper()
+
+
+def prepare_cdsd(
+    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
+) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
+    """
+    Returns the manifests which consist of the Recordings and Supervisions
+    for the "1h" and "10h" subsets of CDSD.
+
+    For each subset, transcripts are read from the ``*.txt`` files found
+    recursively under ``after_catting/<part>/Text`` (one ``<id> <text>`` entry
+    per line) and matched by file stem with the ``*.wav`` files found
+    recursively under ``after_catting/<part>/Audio``. The speaker is the name
+    of the directory that contains the wav file. Audio without a transcript,
+    blank transcript lines and lines holding only an id are skipped.
+    The manifests are passed through ``fix_manifests`` and
+    ``validate_recordings_and_supervisions`` before being stored.
+
+    :param corpus_dir: Pathlike, the path of the data dir.
+    :param output_dir: Pathlike, the path where to write the manifests
+        (created if missing). When None, nothing is written to disk.
+    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
+    :raises AssertionError: if ``corpus_dir`` is not a directory.
+    """
+    corpus_dir = Path(corpus_dir)
+    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    manifests = defaultdict(dict)
+    dataset_parts = ["1h", "10h"]
+    for part in dataset_parts:
+        logging.info(f"Processing CDSD subset: {part}")
+        recordings = []
+        supervisions = []
+
+        txt_path = corpus_dir / "after_catting" / part / "Text"
+        transcript_dict = {}
+        # rglob() is already recursive, so no "**/" prefix is needed. Sorting
+        # makes the result independent of the filesystem's listing order: if
+        # an id occurs more than once, the last file in sorted order wins.
+        for text_path in sorted(txt_path.rglob("*.txt")):
+            with open(text_path, "r", encoding="utf-8") as f:
+                for line in f:
+                    fields = line.strip().split(maxsplit=1)
+                    if not fields:
+                        # Blank line: nothing to index.
+                        continue
+                    if len(fields) < 2:
+                        # An id with no text after it cannot be unpacked
+                        # into (id, transcript); skip it instead of crashing.
+                        logging.warning(
+                            f"Empty transcript for {fields[0]} in {text_path}"
+                        )
+                        continue
+                    idx_transcript, content = fields
+                    transcript_dict[idx_transcript] = text_normalize(content)
+
+        wav_path = corpus_dir / "after_catting" / part / "Audio"
+        # Sorted for a reproducible manifest order; a list also lets tqdm
+        # report the total number of files.
+        audio_paths = sorted(wav_path.rglob("*.wav"))
+        for audio_path in tqdm(audio_paths, desc="Processing audio"):
+            idx = audio_path.stem
+            speaker = audio_path.parent.name
+            if idx not in transcript_dict:
+                logging.warning(f"No transcript: {idx}")
+                logging.warning(f"{audio_path} has no transcript.")
+                continue
+            text = transcript_dict[idx]
+            if not audio_path.is_file():
+                logging.warning(f"No such file: {audio_path}")
+                continue
+            recording = Recording.from_file(audio_path)
+            recordings.append(recording)
+            segment = SupervisionSegment(
+                id=idx,
+                # Use the id the Recording actually carries so that the
+                # supervision always points at its recording.
+                recording_id=recording.id,
+                start=0.0,
+                duration=recording.duration,
+                channel=0,
+                language="Chinese",
+                speaker=speaker,
+                # here we remove the space between words in the text
+                # in advance.
+                text=text.strip().replace(" ", ""),
+            )
+            supervisions.append(segment)
+
+        recording_set = RecordingSet.from_recordings(recordings)
+        supervision_set = SupervisionSet.from_segments(supervisions)
+        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
+        validate_recordings_and_supervisions(recording_set, supervision_set)
+
+        if output_dir is not None:
+            supervision_set.to_file(output_dir / f"cdsd_supervisions_{part}.jsonl.gz")
+            recording_set.to_file(output_dir / f"cdsd_recordings_{part}.jsonl.gz")
+
+        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}
+
+    return manifests