Adding LibriSpeech word alignments in supervisions #379

Merged 3 commits on Aug 20, 2021
48 changes: 47 additions & 1 deletion lhotse/dataset/speech_recognition.py
@@ -6,7 +6,7 @@
from lhotse import validate
from lhotse.cut import CutSet
from lhotse.dataset.input_strategies import BatchIO, PrecomputedFeatures
-from lhotse.utils import ifnone
+from lhotse.utils import compute_num_frames, ifnone


class K2SpeechRecognitionDataset(torch.utils.data.Dataset):
@@ -130,6 +130,52 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]
        if self.return_cuts:
            batch['supervisions']['cut'] = [cut for cut in cuts for sup in cut.supervisions]

        has_word_alignments = all(
            s.alignment is not None and "word" in s.alignment
            for c in cuts
            for s in c.supervisions
        )
        if has_word_alignments:
            # TODO: might need to refactor BatchIO API to move the following conditional logic
            # into these objects (e.g. use like: self.input_strategy.convert_timestamp(),
            # that returns either num_frames or num_samples depending on the strategy).
            words, starts, ends = [], [], []
            frame_shift = cuts[0].frame_shift
            sampling_rate = cuts[0].sampling_rate
            if frame_shift is None:
                try:
                    frame_shift = self.input_strategy.extractor.frame_shift
                except AttributeError:
                    raise ValueError(
                        "Can't determine the frame_shift -- it is not present either in cuts or the input_strategy."
                    )
            for c in cuts:
                for s in c.supervisions:
                    words.append([aliword.symbol for aliword in s.alignment["word"]])
                    starts.append(
                        [
                            compute_num_frames(
                                aliword.start,
                                frame_shift=frame_shift,
                                sampling_rate=sampling_rate,
                            )
                            for aliword in s.alignment["word"]
                        ]
                    )
                    ends.append(
                        [
                            compute_num_frames(
                                aliword.end,
                                frame_shift=frame_shift,
                                sampling_rate=sampling_rate,
                            )
                            for aliword in s.alignment["word"]
                        ]
                    )
            batch["supervisions"]["word"] = words
            batch["supervisions"]["word_start"] = starts
            batch["supervisions"]["word_end"] = ends

        return batch


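Editor's note: with this change, whenever every supervision in the batch carries a word-level alignment, batch["supervisions"] gains three extra keys: "word" (lists of word symbols per supervision), plus "word_start" and "word_end" (the same words' start and end times converted to feature-frame indices via compute_num_frames; e.g. a word starting at 0.48 s with a 0.01 s frame shift lands on frame 48). A minimal sketch of reading these fields, not part of the diff, assuming a hypothetical manifest cuts_with_alignments.jsonl.gz whose cuts have precomputed features and "word" alignments on every supervision:

from lhotse import CutSet
from lhotse.dataset import K2SpeechRecognitionDataset

# Hypothetical path: any cut manifest whose supervisions all carry a "word"
# alignment and whose cuts have precomputed features.
cuts = CutSet.from_file('cuts_with_alignments.jsonl.gz')

dataset = K2SpeechRecognitionDataset()
batch = dataset[cuts.subset(first=4)]  # __getitem__ takes a CutSet directly

for words, starts, ends in zip(
    batch['supervisions']['word'],
    batch['supervisions']['word_start'],
    batch['supervisions']['word_end'],
):
    for word, start, end in zip(words, starts, ends):
        print(f'{word}: frames [{start}, {end})')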
58 changes: 50 additions & 8 deletions lhotse/recipes/librispeech.py
@@ -2,28 +2,33 @@
import re
import shutil
import tarfile
import zipfile
from concurrent.futures.thread import ThreadPoolExecutor
from pathlib import Path
-from typing import Dict, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Union

from tqdm.auto import tqdm

from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
-from lhotse.supervision import SupervisionSegment, SupervisionSet
-from lhotse.utils import Pathlike, urlretrieve_progress
+from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
+from lhotse.utils import Pathlike, is_module_available, urlretrieve_progress

LIBRISPEECH = ('dev-clean', 'dev-other', 'test-clean', 'test-other',
               'train-clean-100', 'train-clean-360', 'train-other-500')
MINI_LIBRISPEECH = ('dev-clean-2', 'train-clean-5')

LIBRISPEECH_ALIGNMENTS_URL = 'https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE'


def download_librispeech(
    target_dir: Pathlike = '.',
    dataset_parts: Optional[Union[str, Sequence[str]]] = "mini_librispeech",
-    force_download: Optional[bool] = False,
-    base_url: Optional[str] = 'http://www.openslr.org/resources'
+    force_download: bool = False,
+    alignments: bool = False,
+    base_url: str = 'http://www.openslr.org/resources',
+    alignments_url: str = LIBRISPEECH_ALIGNMENTS_URL,
) -> None:
"""
Download and untar the dataset, supporting both LibriSpeech and MiniLibrispeech
@@ -32,7 +37,10 @@ def download_librispeech(
    :param dataset_parts: "librispeech", "mini_librispeech",
        or a list of splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param alignments: whether to download the word alignments. The original source is:
        https://github.com/CorentinJ/librispeech-alignments
    :param base_url: str, the url of the OpenSLR resources.
    :param alignments_url: str, the url of the LibriSpeech word alignments.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
@@ -41,6 +49,8 @@ def download_librispeech(
        dataset_parts = LIBRISPEECH
    elif dataset_parts == "mini_librispeech":
        dataset_parts = MINI_LIBRISPEECH
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    for part in tqdm(dataset_parts, desc='Downloading LibriSpeech parts'):
        logging.info(f'Processing split: {part}')
@@ -69,6 +79,18 @@ def download_librispeech(
            tar.extractall(path=target_dir)
        completed_detector.touch()

    if alignments:
        completed_detector = target_dir / '.ali_completed'
        if completed_detector.is_file() and not force_download:
            return
        assert is_module_available('gdown'), 'To download LibriSpeech alignments, please run "pip install gdown".'
        import gdown
        ali_zip_path = str(target_dir / 'LibriSpeech-Alignments.zip')
        gdown.download(alignments_url, output=ali_zip_path)
        with zipfile.ZipFile(ali_zip_path) as f:
            f.extractall(path=target_dir)
        completed_detector.touch()
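Editor's note: for reference, a download invocation that also fetches the alignments might look like the sketch below; the target directory and split are arbitrary choices, and the gdown package must be installed for the alignments step.

from lhotse.recipes import download_librispeech

# Fetches the dev-clean tarball from OpenSLR plus the alignments zip
# from Google Drive (the latter requires gdown).
download_librispeech(target_dir='data', dataset_parts=['dev-clean'], alignments=True)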


def prepare_librispeech(
    corpus_dir: Pathlike,
@@ -123,7 +145,12 @@ def prepare_librispeech(
            supervisions = []
            part_path = corpus_dir / part
            futures = []
-            for trans_path in tqdm(part_path.rglob('*.txt'), desc='Distributing tasks', leave=False):
+            for trans_path in tqdm(part_path.rglob('*.trans.txt'), desc='Distributing tasks', leave=False):
                alignments = {}
                ali_path = trans_path.parent / (trans_path.stem.split('.')[0] + '.alignment.txt')
                if ali_path.exists():
                    alignments = parse_alignments(ali_path)
# "trans_path" file contains lines like:
#
# 121-121726-0000 ALSO A POPULAR CONTRIVANCE
Expand All @@ -133,7 +160,7 @@ def prepare_librispeech(
# We will create a separate Recording and SupervisionSegment for those.
with open(trans_path) as f:
for line in f:
futures.append(ex.submit(parse_utterance, part_path, line))
futures.append(ex.submit(parse_utterance, part_path, line, alignments))

            for future in tqdm(futures, desc='Processing', leave=False):
                result = future.result()
@@ -163,6 +190,7 @@ def prepare_librispeech(
def parse_utterance(
    dataset_split_path: Path,
    line: str,
    alignments: Dict[str, List[AlignmentItem]],
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    recording_id, text = line.strip().split(maxsplit=1)
    # Create the Recording first
@@ -180,6 +208,20 @@ def parse_utterance(
        channel=0,
        language='English',
        speaker=re.sub(r'-.*', r'', recording.id),
-        text=text.strip()
+        text=text.strip(),
+        alignment={"word": alignments[recording_id]} if recording_id in alignments else None
    )
    return recording, segment


def parse_alignments(ali_path: Pathlike) -> Dict[str, List[AlignmentItem]]:
    alignments = {}
    for line in Path(ali_path).read_text().splitlines():
        utt_id, words, timestamps = line.split()
        words = words.replace('"', '').split(',')
        timestamps = [0.0] + list(map(float, timestamps.replace('"', '').split(',')))
        alignments[utt_id] = [
            AlignmentItem(symbol=word, start=start, duration=round(end - start, ndigits=8))
            for word, start, end in zip(words, timestamps, timestamps[1:])
        ]
    return alignments
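Editor's note: to illustrate the alignment file format this parser expects, each line holds an utterance id, a quoted comma-separated word list (empty symbols mark silence), and the quoted end time of each word; the parser prepends 0.0 and pairs consecutive timestamps, so every word spans the interval from the previous word's end to its own end. A sketch with a made-up utterance:

from pathlib import Path
from lhotse.recipes.librispeech import parse_alignments

# A hypothetical single-line alignment file in the CorentinJ format.
ali_path = Path('84-121123.alignment.txt')
ali_path.write_text('84-121123-0001 ",HELLO,WORLD," "0.32,0.61,1.05,1.30"\n')

for item in parse_alignments(ali_path)['84-121123-0001']:
    print(item.symbol or '<sil>', item.start, item.duration)
# Prints: <sil> 0.0 0.32 / HELLO 0.32 0.29 / WORLD 0.61 0.44 / <sil> 1.05 0.25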