Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Recipe for the Chinese Dysarthric Speech Database #1423

Merged
merged 3 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_callhome_egyptian`
* - CallHome English
- :func:`lhotse.recipes.prepare_callhome_english`
* - Chinese Dysarthric Speech Database
- :func:`lhotse.recipes.prepare_cdsd`
* - CHiME-6
- :func:`lhotse.recipes.prepare_chime6`
* - CMU Arctic
Expand Down
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .bvcc import *
from .callhome_egyptian import *
from .callhome_english import *
from .cdsd import *
from .chime6 import *
from .cmu_arctic import *
from .cmu_indic import *
Expand Down
15 changes: 15 additions & 0 deletions lhotse/bin/modes/recipes/cdsd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import click

from lhotse.bin.modes import prepare
from lhotse.recipes.cdsd import prepare_cdsd
from lhotse.utils import Pathlike

__all__ = ["cdsd"]


@prepare.command(context_settings={"show_default": True})
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
def cdsd(corpus_dir: Pathlike, output_dir: Pathlike):
    """CDSD ASR data preparation."""
    # Thin CLI wrapper: both paths are forwarded to the recipe unchanged and
    # the manifests it returns are not used here.
    prepare_cdsd(corpus_dir=corpus_dir, output_dir=output_dir)
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .bvcc import download_bvcc, prepare_bvcc
from .callhome_egyptian import prepare_callhome_egyptian
from .callhome_english import prepare_callhome_english
from .cdsd import prepare_cdsd
from .chime6 import download_chime6, prepare_chime6
from .cmu_arctic import download_cmu_arctic, prepare_cmu_arctic
from .cmu_indic import download_cmu_indic, prepare_cmu_indic
Expand Down
116 changes: 116 additions & 0 deletions lhotse/recipes/cdsd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""
About the CDSD (Chinese Dysarthric Speech Database) dataset:

This database comprises speech data from 24 participants with dysarthria.

Each participant recorded one hour of speech, and one of them recorded an additional 10 hours, resulting in 34 hours of speech material in total.

To accommodate participants with varying cognitive levels, the text pool primarily consists of content from the AISHELL-1 dataset and speeches by primary and secondary school students. When participants read these texts, they must use a mobile device or the ZOOM F8n multi-track field recorder to record their speeches.

In this paper, the authors elucidate the data collection and annotation processes and present an approach for establishing a baseline for dysarthric speech recognition. Furthermore, the authors conducted a speaker-dependent dysarthric speech recognition experiment using an additional 10 hours of speech data from one of the participants.

arXiv link: https://arxiv.org/abs/2310.15930v1
"""

import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, Optional, Union

from tqdm.auto import tqdm

from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.qa import fix_manifests
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike


# Full-width Latin letters mapped to their ASCII counterparts.  They are
# spelled as escapes on purpose: written literally, a tool that folds
# full-width characters to ASCII would silently turn every entry into a
# no-op such as "a" -> "a".
_FULLWIDTH_TO_ASCII = str.maketrans(
    {
        "\uff41": "a",  # FULLWIDTH LATIN SMALL LETTER A
        "\uff42": "b",  # FULLWIDTH LATIN SMALL LETTER B
        "\uff43": "c",  # FULLWIDTH LATIN SMALL LETTER C
        "\uff4b": "k",  # FULLWIDTH LATIN SMALL LETTER K
        "\uff54": "t",  # FULLWIDTH LATIN SMALL LETTER T
    }
)


def text_normalize(line: str) -> str:
    """
    Normalize one transcript string.

    Modified from the sed pipeline in
    https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/aishell_data_prep.sh#L54
    The full-width forms of the letters a, b, c, k and t are replaced by
    their ASCII equivalents, then the whole string is upper-cased.

    :param line: str, the raw transcript text.
    :return: str, the normalized transcript text.
    """
    # Translate before upper-casing: ``str.upper`` alone would turn a
    # full-width small letter into a full-width capital, not an ASCII one.
    return line.translate(_FULLWIDTH_TO_ASCII).upper()


def _read_cdsd_transcripts(text_dir: Path) -> Dict[str, str]:
    """Map utterance id -> normalized transcript for every ``*.txt`` file under *text_dir*."""
    transcripts: Dict[str, str] = {}
    # Sorted so that, if an id shows up in more than one file, the entry that
    # wins does not depend on filesystem iteration order.
    for text_path in sorted(text_dir.rglob("*.txt")):
        with open(text_path, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                # Each useful line is "<utterance id> <transcript>".
                fields = line.strip().split(maxsplit=1)
                if not fields:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                if len(fields) < 2:
                    # An id with nothing after it cannot be unpacked into
                    # (id, text); skip it instead of aborting the whole run.
                    logging.warning(
                        f"{text_path}:{line_no}: no transcript text for {fields[0]}, skipping."
                    )
                    continue
                idx_transcript, content = fields
                transcripts[idx_transcript] = text_normalize(content)
    return transcripts


def prepare_cdsd(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    For each part in ``1h`` and ``10h`` the transcripts are read from
    ``<corpus_dir>/after_catting/<part>/Text/**/*.txt`` and the audio from
    ``<corpus_dir>/after_catting/<part>/Audio/**/*.wav``. A wav file is matched
    to its transcript by file stem, and the name of the directory containing
    the wav file is used as the speaker. Wav files without a transcript are
    skipped with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
        When given, ``cdsd_recordings_<part>.jsonl.gz`` and
        ``cdsd_supervisions_<part>.jsonl.gz`` are written there.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    dataset_parts = ["1h", "10h"]
    for part in dataset_parts:
        logging.info(f"Processing CDSD subset: {part}")
        part_dir = corpus_dir / "after_catting" / part
        transcript_dict = _read_cdsd_transcripts(part_dir / "Text")

        recordings = []
        supervisions = []
        # Materialized and sorted: gives a reproducible manifest order and
        # lets tqdm show a total.
        audio_paths = sorted((part_dir / "Audio").rglob("*.wav"))
        for audio_path in tqdm(audio_paths, desc="Processing audio"):
            idx = audio_path.stem
            text = transcript_dict.get(idx)
            if text is None:
                logging.warning(f"{audio_path} has no transcript.")
                continue
            if not audio_path.is_file():
                # e.g. a directory whose name happens to end in ".wav"
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            # NOTE(review): recording_id=idx assumes Recording.from_file
            # derives the recording id from the file stem - confirm.
            segment = SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="Chinese",
                speaker=audio_path.parent.name,
                # here we remove the space between words in the text
                # in advance.
                text=text.strip().replace(" ", ""),
            )
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        # Run the fix and validation passes on the pair before anything is written.
        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"cdsd_supervisions_{part}.jsonl.gz")
            recording_set.to_file(output_dir / f"cdsd_recordings_{part}.jsonl.gz")

        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}

    return manifests
Loading