import click

from lhotse.bin.modes import prepare
from lhotse.recipes.cdsd import prepare_cdsd
from lhotse.utils import Pathlike

__all__ = ["cdsd"]


@prepare.command(context_settings=dict(show_default=True))
# file_okay=False: the recipe asserts that corpus_dir is a directory, so reject
# regular files at the CLI layer with a proper usage error instead of letting
# the call fail later with a bare AssertionError.
@click.argument(
    "corpus_dir", type=click.Path(exists=True, file_okay=False, dir_okay=True)
)
@click.argument("output_dir", type=click.Path())
def cdsd(corpus_dir: Pathlike, output_dir: Pathlike):
    """CDSD (Chinese Dysarthric Speech Database) ASR data preparation.

    CORPUS_DIR is the directory that contains the ``after_catting`` folder of
    the corpus; the recording and supervision manifests are written to
    OUTPUT_DIR.
    """
    prepare_cdsd(corpus_dir, output_dir=output_dir)
"""
About the CDSD (Chinese Dysarthric Speech Database) dataset:

    This database comprises speech data from 24 participants with dysarthria.

    Every participant recorded one hour of speech, and one of them recorded an
    additional 10 hours, resulting in 34 hours of speech material.

    To accommodate participants with varying cognitive levels, the text pool
    primarily consists of content from the AISHELL-1 dataset and speeches by
    primary and secondary school students. When participants read these texts,
    they must use a mobile device or the ZOOM F8n multi-track field recorder
    to record their speeches.

    In this paper, the authors elucidate the data collection and annotation
    processes and present an approach for establishing a baseline for
    dysarthric speech recognition. Furthermore, the authors conducted a
    speaker-dependent dysarthric speech recognition experiment using an
    additional 10 hours of speech data from one of the participants.

arXiv link: https://arxiv.org/abs/2310.15930v1
"""

import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, Optional, Union

from tqdm.auto import tqdm

from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.qa import fix_manifests
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike

# Full-width Latin letters mapped to their ASCII counterparts. The characters
# are spelled as code-point escapes so that editors or encoding round-trips
# cannot silently collapse them into ASCII and turn the mapping into a no-op.
_FULLWIDTH_TO_ASCII = str.maketrans(
    {
        "\uff41": "a",  # FULLWIDTH LATIN SMALL LETTER A
        "\uff42": "b",  # FULLWIDTH LATIN SMALL LETTER B
        "\uff43": "c",  # FULLWIDTH LATIN SMALL LETTER C
        "\uff4b": "k",  # FULLWIDTH LATIN SMALL LETTER K
        "\uff54": "t",  # FULLWIDTH LATIN SMALL LETTER T
    }
)


def text_normalize(line: str) -> str:
    """
    Normalize a transcript line.

    Modified from
    https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/aishell_data_prep.sh#L54
    which pipes the transcripts through a chain of ``sed`` substitutions that
    replace the full-width letters a, b, c, k and t (U+FF41, U+FF42, U+FF43,
    U+FF4B, U+FF54) with their ASCII equivalents.

    :param line: str, the raw transcript text.
    :return: the text with those full-width letters converted to ASCII and
        then upper-cased.
    """
    return line.translate(_FULLWIDTH_TO_ASCII).upper()


def _load_transcripts(text_dir: Path) -> Dict[str, str]:
    """Read every ``*.txt`` file under ``text_dir`` into an ``{utterance_id: text}`` dict."""
    transcripts = {}
    # rglob() is already recursive, so no leading "**/" is needed in the pattern.
    for text_path in text_dir.rglob("*.txt"):
        with open(text_path, "r", encoding="utf-8") as f:
            for line in f:
                fields = line.strip().split(maxsplit=1)
                if len(fields) != 2:
                    # Blank lines are skipped silently; a line holding only an
                    # id (no text) is reported, and neither aborts the run.
                    if fields:
                        logging.warning(
                            f"Skipping malformed transcript line in {text_path}: {line!r}"
                        )
                    continue
                utt_id, content = fields
                transcripts[utt_id] = text_normalize(content)
    return transcripts


def prepare_cdsd(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    The "1h" and "10h" subsets are read from
    ``corpus_dir/after_catting/<part>/Text`` (transcripts) and
    ``corpus_dir/after_catting/<part>/Audio`` (wav files). Audio files without
    a matching transcript are skipped with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
        When None, nothing is written to disk.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    dataset_parts = ["1h", "10h"]
    for part in dataset_parts:
        logging.info(f"Processing CDSD subset: {part}")
        part_dir = corpus_dir / "after_catting" / part
        recordings = []
        supervisions = []

        transcript_dict = _load_transcripts(part_dir / "Text")

        wav_path = part_dir / "Audio"
        for audio_path in tqdm(wav_path.rglob("*.wav"), desc="Processing audio"):
            idx = audio_path.stem
            # The name of the directory holding the wav file is used as the
            # speaker label.
            speaker = audio_path.parent.name
            if idx not in transcript_dict:
                logging.warning(f"No transcript: {idx}")
                logging.warning(f"{audio_path} has no transcript.")
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            segment = SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="Chinese",
                speaker=speaker,
                # here we remove the space between words in the text
                # in advance.
                text=text.strip().replace(" ", ""),
            )
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"cdsd_supervisions_{part}.jsonl.gz")
            recording_set.to_file(output_dir / f"cdsd_recordings_{part}.jsonl.gz")

        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}

    return manifests