#!/usr/bin/env python3

# Copyright 2021 Xiaomi Corporation (Author: Mingshuang Luo)
# Apache 2.0

"""
TIMIT corpus preparation.

Provides :func:`download_timit` to fetch and unpack the corpus, and
:func:`prepare_timit` to build Lhotse ``RecordingSet``/``SupervisionSet``
manifests with phone-level transcripts mapped to a 60-, 48- or 39-phone set.
(Exported via ``lhotse.recipes.__init__`` as ``download_timit`` and
``prepare_timit``.)
"""

import logging
import zipfile
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Union

from tqdm import tqdm

from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, urlretrieve_progress


def download_timit(
    target_dir: Pathlike = '.',
    force_download: bool = False,
    base_url: Optional[str] = 'https://data.deepai.org/timit.zip',
) -> None:
    """
    Download and unzip the TIMIT dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: bool, if True, download the zip no matter if it already exists.
    :param base_url: str, the URL of the TIMIT dataset to download.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    zip_name = 'timit.zip'
    zip_path = target_dir / zip_name
    if force_download or not zip_path.is_file():
        urlretrieve_progress(
            base_url, filename=zip_path, desc=f'Downloading {zip_name}'
        )
    # Extract next to the downloaded archive. (The original code extracted
    # into a path relative to the current working directory, silently
    # ignoring ``target_dir``.)
    extract_dir = target_dir / zip_name[:-4]
    extract_dir.mkdir(exist_ok=True)
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(extract_dir)


def prepare_timit(
    corpus_dir: Pathlike,
    splits_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_phones: int = 48,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Return the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param splits_dir: Pathlike, the path of the txt files for data division
        (train_samples.txt, dev_samples.txt, tst_samples.txt).
    :param output_dir: Pathlike, the path where to write and save the manifests.
    :param num_phones: int, the number of phones (60, 48 or 39) for modeling;
        48 is the default.
    :param num_jobs: int, kept for interface compatibility; preparation is
        currently sequential.
    :raises ValueError: if ``num_phones`` is not one of 60, 48 or 39.
    :return: a Dict whose key is the dataset part ('TRAIN', 'DEV', 'TEST'),
        and the value is a Dict with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    splits_dir = Path(splits_dir)
    assert splits_dir.is_dir(), f'No such directory: {splits_dir}'

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    if num_phones not in (60, 48, 39):
        raise ValueError("The value of num_phones must be in [60, 48, 39].")
    # Empty mapping for 60 phones => original labels are kept as-is below.
    phones_dict = get_phonemes(num_phones)

    manifests = defaultdict(dict)
    # One sample-list file per dataset part.
    split_files = {
        'TRAIN': 'train_samples.txt',
        'DEV': 'dev_samples.txt',
        'TEST': 'tst_samples.txt',
    }

    for part, list_name in split_files.items():
        # Each line's last whitespace-separated field is a path relative
        # to the corpus root.
        wav_files: List[Path] = []
        with open(splits_dir / list_name, 'r') as f:
            for line in f:
                items = line.strip().split(' ')
                wav_files.append(corpus_dir / items[-1])

        logging.debug(f'{part} dataset manifest generation.')
        recordings = []
        supervisions = []

        for wav_file in tqdm(wav_files):
            wav_path = Path(wav_file)
            # Use pathlib components instead of splitting on '/' so the
            # recipe also works with native Windows paths.
            speaker = wav_path.parent.name
            idx = f'{speaker}-{wav_path.stem}'
            transcript_file = wav_path.with_suffix('.PHN')
            if not wav_path.is_file():
                logging.warning(f'No such file: {wav_file}')
                continue
            if not transcript_file.is_file():
                logging.warning(f'No transcript: {transcript_file}')
                continue

            phones = []
            with open(transcript_file, 'r') as f:
                for line in f:
                    # .PHN lines are "<start> <end> <phone>".
                    phone = line.rstrip('\n').split(' ')[-1]
                    if num_phones != 60:
                        phone = phones_dict[str(phone)]
                    # Skip phones mapped to '' (e.g. 'q' in the reduced
                    # sets) so the transcript has no double spaces.
                    if phone:
                        phones.append(phone)
            text = ' '.join(phones).replace('h#', 'sil')

            recording = Recording.from_file(path=wav_path, recording_id=idx)
            recordings.append(recording)
            supervisions.append(
                SupervisionSegment(
                    id=idx,
                    recording_id=idx,
                    start=0.0,
                    duration=recording.duration,
                    channel=0,
                    language='English',
                    speaker=speaker,
                    text=text.strip(),
                )
            )

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f'supervisions_{part}.json')
            recording_set.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': recording_set,
            'supervisions': supervision_set,
        }

    return manifests


def get_phonemes(num_phones: int) -> Dict[str, str]:
    """
    Return a mapping that collapses the 60-phone TIMIT set for modeling.

    :param num_phones: the number of phones for modeling (60, 48 or 39).
    :return: a dict mapping each 60-set phone to its reduced-set symbol;
        for 60 phones (or any other value) an empty dict is returned and
        callers keep the original labels. 'q' maps to the empty string
        (it is dropped from transcripts).
    """
    if num_phones == 48:
        logging.debug("Using 48 phones for modeling!")
        # Convert the 60 phoneme set into the 48 one.
        return {
            'sil': 'sil', 'aa': 'aa', 'ae': 'ae', 'ah': 'ah', 'ao': 'ao',
            'aw': 'aw', 'ax': 'ax', 'ax-h': 'ax', 'axr': 'er', 'ay': 'ay',
            'b': 'b', 'bcl': 'vcl', 'ch': 'ch', 'd': 'd', 'dcl': 'vcl',
            'dh': 'dh', 'dx': 'dx', 'eh': 'eh', 'el': 'el', 'em': 'm',
            'en': 'en', 'eng': 'ng', 'epi': 'epi', 'er': 'er', 'ey': 'ey',
            'f': 'f', 'g': 'g', 'gcl': 'vcl', 'h#': 'sil', 'hh': 'hh',
            'hv': 'hh', 'ih': 'ih', 'ix': 'ix', 'iy': 'iy', 'jh': 'jh',
            'k': 'k', 'kcl': 'cl', 'l': 'l', 'm': 'm', 'n': 'n',
            'ng': 'ng', 'nx': 'n', 'ow': 'ow', 'oy': 'oy', 'p': 'p',
            'pau': 'sil', 'pcl': 'cl', 'q': '', 'r': 'r', 's': 's',
            'sh': 'sh', 't': 't', 'tcl': 'cl', 'th': 'th', 'uh': 'uh',
            'uw': 'uw', 'ux': 'uw', 'v': 'v', 'w': 'w', 'y': 'y',
            'z': 'z', 'zh': 'zh',
        }
    if num_phones == 39:
        logging.debug("Using 39 phones for modeling!")
        # Convert the 60 phoneme set into the 39 one.
        return {
            'sil': 'sil', 'aa': 'aa', 'ae': 'ae', 'ah': 'ah', 'ao': 'aa',
            'aw': 'aw', 'ax': 'ah', 'ax-h': 'ah', 'axr': 'er', 'ay': 'ay',
            'b': 'b', 'bcl': 'sil', 'ch': 'ch', 'd': 'd', 'dcl': 'sil',
            'dh': 'dh', 'dx': 'dx', 'eh': 'eh', 'el': 'l', 'em': 'm',
            'en': 'n', 'eng': 'ng', 'epi': 'sil', 'er': 'er', 'ey': 'ey',
            'f': 'f', 'g': 'g', 'gcl': 'sil', 'h#': 'sil', 'hh': 'hh',
            'hv': 'hh', 'ih': 'ih', 'ix': 'ih', 'iy': 'iy', 'jh': 'jh',
            'k': 'k', 'kcl': 'sil', 'l': 'l', 'm': 'm', 'n': 'n',
            'ng': 'ng', 'nx': 'n', 'ow': 'ow', 'oy': 'oy', 'p': 'p',
            'pau': 'sil', 'pcl': 'sil', 'q': '', 'r': 'r', 's': 's',
            'sh': 'sh', 't': 't', 'tcl': 'sil', 'th': 'th', 'uh': 'uh',
            'uw': 'uw', 'ux': 'uw', 'v': 'v', 'w': 'w', 'y': 'y',
            'z': 'z', 'zh': 'sh',
        }
    logging.debug("Using 60 phones for modeling!")
    return {}