-
Notifications
You must be signed in to change notification settings - Fork 221
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Recipe for the Chinese Dysarthric Speech Database (#1423)
* init commit * minor fixes * Update corpus.rst
- Loading branch information
Showing
5 changed files
with
135 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import click | ||
|
||
from lhotse.bin.modes import prepare | ||
from lhotse.recipes.cdsd import prepare_cdsd | ||
from lhotse.utils import Pathlike | ||
|
||
__all__ = ["cdsd"] | ||
|
||
|
||
@prepare.command(context_settings=dict(show_default=True))
@click.argument(
    "corpus_dir",
    # file_okay=False: the corpus root must be a directory, so let click reject
    # a regular file with a usage error instead of failing later inside
    # prepare_cdsd.
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
)
@click.argument("output_dir", type=click.Path())
def cdsd(corpus_dir: Pathlike, output_dir: Pathlike):
    """CDSD ASR data preparation."""
    # The docstring above is what click prints as the command help text.
    # corpus_dir: existing root directory of the CDSD corpus.
    # output_dir: directory the manifests are written to; it does not have to
    #             exist beforehand.
    prepare_cdsd(corpus_dir, output_dir=output_dir)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
""" | ||
About the CDSD (Chinese Dysarthric Speech Database) dataset: | ||
This database comprises speech data from 24 participants with dysarthria. | ||
Each participant recorded one hour of speech, and one of them recorded an additional 10 hours, resulting in 34 hours of speech material in total. | ||
To accommodate participants with varying cognitive levels, the text pool primarily consists of content from the AISHELL-1 dataset and speeches by primary and secondary school students. When participants read these texts, they must use a mobile device or the ZOOM F8n multi-track field recorder to record their speeches. | ||
In this paper, the authors elucidate the data collection and annotation processes and present an approach for establishing a baseline for dysarthric speech recognition. Furthermore, the authors conducted a speaker-dependent dysarthric speech recognition experiment using an additional 10 hours of speech data from one of the participants. | ||
arXiv link: https://arxiv.org/abs/2310.15930v1 | ||
""" | ||
|
||
import logging | ||
from collections import defaultdict | ||
from pathlib import Path | ||
from typing import Dict, Optional, Union | ||
|
||
from tqdm.auto import tqdm | ||
|
||
from lhotse import validate_recordings_and_supervisions | ||
from lhotse.audio import Recording, RecordingSet | ||
from lhotse.qa import fix_manifests | ||
from lhotse.supervision import SupervisionSegment, SupervisionSet | ||
from lhotse.utils import Pathlike | ||
|
||
|
||
# Full-width Latin small letters that appear in AISHELL-style transcripts,
# mapped to their ASCII counterparts. The keys are written as escapes so the
# table stays correct even if a tool normalises full-width characters in the
# source text.
_FULLWIDTH_TO_ASCII = str.maketrans(
    {
        "\uff41": "a",  # FULLWIDTH LATIN SMALL LETTER A
        "\uff42": "b",  # FULLWIDTH LATIN SMALL LETTER B
        "\uff43": "c",  # FULLWIDTH LATIN SMALL LETTER C
        "\uff4b": "k",  # FULLWIDTH LATIN SMALL LETTER K
        "\uff54": "t",  # FULLWIDTH LATIN SMALL LETTER T
    }
)


def text_normalize(line: str) -> str:
    """
    Normalize one transcript line.

    Modified from https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/aishell_data_prep.sh#L54
    which pipes the transcripts through a chain of ``sed`` substitutions that
    turn the full-width letters a, b, c, k and t into plain ASCII letters.

    The same five substitutions are applied here in a single pass, after which
    the whole line is upper-cased.

    :param line: the transcript text to normalize.
    :return: the normalized, upper-cased text.
    """
    # Translate first: upper-casing a full-width letter would yield a
    # full-width capital, which the table does not cover.
    return line.translate(_FULLWIDTH_TO_ASCII).upper()
|
||
|
||
def prepare_cdsd(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    For each subset ("1h" and "10h"), transcripts are collected from every
    ``*.txt`` file under ``<corpus_dir>/after_catting/<part>/Text`` and audio
    from every ``*.wav`` file under ``<corpus_dir>/after_catting/<part>/Audio``.
    Each transcript line is expected to be ``<utterance-id> <text>``; the audio
    file stem is used as the utterance id and the name of the directory that
    contains the audio file is used as the speaker.

    Blank transcript lines are ignored, and lines that carry an id but no text
    are skipped with a warning. Audio files without a transcript are skipped
    with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
        When None, nothing is written to disk.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    dataset_parts = ["1h", "10h"]
    for part in dataset_parts:
        logging.info(f"Processing CDSD subset: {part}")
        recordings = []
        supervisions = []

        # Map utterance id -> normalized transcript for this subset.
        txt_path = corpus_dir / "after_catting" / f"{part}" / "Text"
        transcript_dict = {}
        # rglob() is already recursive, so no extra "**/" prefix is needed.
        for text_path in txt_path.rglob("*.txt"):
            with open(text_path, "r", encoding="utf-8") as f:
                for line in f:
                    fields = line.strip().split(maxsplit=1)
                    if len(fields) != 2:
                        # Blank lines are silently ignored; an id without any
                        # text is reported, since its audio will be dropped.
                        if fields:
                            logging.warning(
                                f"Skipping malformed transcript line in "
                                f"{text_path}: {line.strip()!r}"
                            )
                        continue
                    idx_transcript, content = fields
                    transcript_dict[idx_transcript] = text_normalize(content)

        wav_path = corpus_dir / "after_catting" / f"{part}" / "Audio"
        for audio_path in tqdm(wav_path.rglob("*.wav"), desc="Processing audio"):
            idx = audio_path.stem
            # The directory that directly contains the audio file names the speaker.
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f"{audio_path} has no transcript.")
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            segment = SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="Chinese",
                speaker=speaker,
                # here we remove the space between words in the text
                # in advance.
                text=text.strip().replace(" ", ""),
            )
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"cdsd_supervisions_{part}.jsonl.gz")
            recording_set.to_file(output_dir / f"cdsd_recordings_{part}.jsonl.gz")

        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}

    return manifests