Skip to content

Commit

Permalink
Recipe for the Chinese Dysarthric Speech Database (#1423)
Browse files Browse the repository at this point in the history
* init commit

* minor fixes

* Update corpus.rst
  • Loading branch information
JinZr authored Nov 26, 2024
1 parent 5028e4c commit e955f29
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_callhome_egyptian`
* - CallHome English
- :func:`lhotse.recipes.prepare_callhome_english`
* - Chinese Dysarthric Speech Database
- :func:`lhotse.recipes.prepare_cdsd`
* - CHiME-6
- :func:`lhotse.recipes.prepare_chime6`
* - CMU Arctic
Expand Down
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .bvcc import *
from .callhome_egyptian import *
from .callhome_english import *
from .cdsd import *
from .chime6 import *
from .cmu_arctic import *
from .cmu_indic import *
Expand Down
15 changes: 15 additions & 0 deletions lhotse/bin/modes/recipes/cdsd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import click

from lhotse.bin.modes import prepare
from lhotse.recipes.cdsd import prepare_cdsd
from lhotse.utils import Pathlike

__all__ = ["cdsd"]


# NOTE: the docstring below is rendered by click as the command's --help text.
@prepare.command(context_settings=dict(show_default=True))
# corpus_dir must be an existing *directory*: click.Path accepts regular files
# by default, so file_okay=False is needed to reject them at argument-parsing
# time with a usage error rather than failing later inside prepare_cdsd.
@click.argument(
    "corpus_dir", type=click.Path(exists=True, dir_okay=True, file_okay=False)
)
# output_dir does not have to exist; it is passed through to prepare_cdsd as-is.
@click.argument("output_dir", type=click.Path())
def cdsd(corpus_dir: Pathlike, output_dir: Pathlike):
    """CDSD ASR data preparation."""
    prepare_cdsd(corpus_dir, output_dir=output_dir)
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .bvcc import download_bvcc, prepare_bvcc
from .callhome_egyptian import prepare_callhome_egyptian
from .callhome_english import prepare_callhome_english
from .cdsd import prepare_cdsd
from .chime6 import download_chime6, prepare_chime6
from .cmu_arctic import download_cmu_arctic, prepare_cmu_arctic
from .cmu_indic import download_cmu_indic, prepare_cmu_indic
Expand Down
116 changes: 116 additions & 0 deletions lhotse/recipes/cdsd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""
About the CDSD (Chinese Dysarthric Speech Database) dataset:
This database comprises speech data from 24 participants with dysarthria.
Each participant recorded one hour of speech, and one of them recorded an additional 10 hours, resulting in 34 hours of speech material.
To accommodate participants with varying cognitive levels, the text pool primarily consists of content from the AISHELL-1 dataset and speeches by primary and secondary school students. When participants read these texts, they must use a mobile device or the ZOOM F8n multi-track field recorder to record their speeches.
In this paper, the authors elucidate the data collection and annotation processes and present an approach for establishing a baseline for dysarthric speech recognition. Furthermore, the authors conducted a speaker-dependent dysarthric speech recognition experiment using an additional 10 hours of speech data from one of the participants.
arXiv link: https://arxiv.org/abs/2310.15930v1
"""

import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, Optional, Union

from tqdm.auto import tqdm

from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.qa import fix_manifests
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike


def text_normalize(line: str):
    """
    Normalize a transcript: map full-width Latin letters to ASCII, then upper-case.

    Modified from https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/aishell_data_prep.sh#L54
    which chains ``sed`` substitutions replacing the full-width letters
    a, b, c, k and t (U+FF41, U+FF42, U+FF43, U+FF4B, U+FF54) with their
    ASCII counterparts.

    :param line: str, the raw transcript text.
    :return: str, the text with the five full-width letters folded to ASCII
        and the whole string upper-cased.
    """
    # The full-width characters are spelled as escapes on purpose: written
    # literally they are visually indistinguishable from ASCII and are easily
    # folded to ASCII by editors/tools, which turns every mapping into a no-op.
    fullwidth_to_ascii = str.maketrans(
        {
            "\uff41": "a",
            "\uff42": "b",
            "\uff43": "c",
            "\uff4b": "k",
            "\uff54": "t",
        }
    )
    # One pass over the string instead of five chained replace() calls.
    return line.translate(fullwidth_to_ascii).upper()


def _load_cdsd_transcripts(text_dir: Path) -> Dict[str, str]:
    """Read all ``*.txt`` files under ``text_dir`` into an {utterance id: normalized text} dict."""
    transcript_dict = {}
    for text_path in text_dir.rglob("**/*.txt"):
        with open(text_path, "r", encoding="utf-8") as f:
            # Stream the file line by line; there is no need to hold it all in memory.
            for line in f:
                fields = line.strip().split(maxsplit=1)
                if len(fields) != 2:
                    # A blank line or an id without any text cannot be unpacked
                    # into (id, text); skip it instead of aborting the whole
                    # preparation with a ValueError.
                    if fields:
                        logging.warning(
                            f"Skipping malformed transcript line in {text_path}: "
                            f"{line.strip()}"
                        )
                    continue
                idx_transcript, content = fields
                # Later occurrences of the same id overwrite earlier ones.
                transcript_dict[idx_transcript] = text_normalize(content)
    return transcript_dict


def prepare_cdsd(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    For each subset ("1h" and "10h") transcripts are read from
    ``corpus_dir/after_catting/<part>/Text`` and audio files from
    ``corpus_dir/after_catting/<part>/Audio``. Audio files without a matching
    transcript are skipped with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
        When None, nothing is written to disk.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    dataset_parts = ["1h", "10h"]
    for part in dataset_parts:
        logging.info(f"Processing CDSD subset: {part}")
        recordings = []
        supervisions = []

        txt_path = corpus_dir / "after_catting" / f"{part}" / "Text"
        transcript_dict = _load_cdsd_transcripts(txt_path)

        wav_path = corpus_dir / "after_catting" / f"{part}" / "Audio"
        for audio_path in tqdm(wav_path.rglob("**/*.wav"), desc="Processing audio"):
            # The file stem is the utterance id used as the key in the
            # transcript files; the parent directory name is used as the speaker.
            idx = audio_path.stem
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f"No transcript: {idx}")
                logging.warning(f"{audio_path} has no transcript.")
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            # One supervision spanning the whole recording.
            segment = SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="Chinese",
                speaker=speaker,
                # here we remove the space between words in the text
                # in advance.
                text=text.strip().replace(" ", ""),
            )
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"cdsd_supervisions_{part}.jsonl.gz")
            recording_set.to_file(output_dir / f"cdsd_recordings_{part}.jsonl.gz")

        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}

    return manifests

0 comments on commit e955f29

Please sign in to comment.