Get timestamps during decoding #598
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

# Download the LibriSpeech word alignments and attach them to the existing
# fbank cut manifests (see local/add_alignment_librispeech.py).
set -euo pipefail

alignments_dir=data/alignment
cuts_in_dir=data/fbank
cuts_out_dir=data/fbank_ali

python3 ./local/add_alignment_librispeech.py \
  --alignments-dir $alignments_dir \
  --cuts-in-dir $cuts_in_dir \
  --cuts-out-dir $cuts_out_dir
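After the wrapper above has run, the augmented manifests under data/fbank_ali can be spot-checked. A minimal sketch, assuming lhotse's AlignmentItem fields (symbol, start, duration) and the manifest naming used by local/add_alignment_librispeech.py; dev-clean is just an example part:

# Sketch: print the first few word-level alignment entries of the first cut.
# Assumes lhotse's AlignmentItem fields (symbol, start, duration).
from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank_ali/librispeech_cuts_dev-clean.jsonl.gz")
first_cut = next(iter(cuts))
for sup in first_cut.supervisions:
    for item in (sup.alignment or {}).get("word", [])[:5]:
        print(item.symbol, item.start, item.duration)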
@@ -0,0 +1,196 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (authors: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file adds alignments from https://github.com/CorentinJ/librispeech-alignments  # noqa
to the existing fbank features dir (e.g., data/fbank)
and saves the cuts to a new dir (e.g., data/fbank_ali).
"""

import argparse
import logging
import zipfile
from pathlib import Path
from typing import List

from lhotse import CutSet, load_manifest_lazy
from lhotse.recipes.librispeech import parse_alignments
from lhotse.utils import is_module_available

LIBRISPEECH_ALIGNMENTS_URL = (
    "https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
)

Review comment: Some users from China may not have access to Google.
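Regarding the comment above: the docstring of download_alignments() below points to a Hugging Face mirror, and the function only calls gdown when LibriSpeech-Alignments.zip is absent, so fetching the zip manually and dropping it into the alignments dir is enough. A sketch only; the resolve/main file path inside the Hugging Face repo is an assumption, not verified here:

# Sketch: manual fallback when drive.google.com is unreachable. The exact
# "resolve/main" file path inside the Hugging Face repo is an assumption.
import urllib.request
from pathlib import Path

target_dir = Path("data/alignment")
target_dir.mkdir(parents=True, exist_ok=True)
url = (
    "https://huggingface.co/Zengwei/librispeech-alignments"
    "/resolve/main/LibriSpeech-Alignments.zip"  # assumed repo layout
)
urllib.request.urlretrieve(url, str(target_dir / "LibriSpeech-Alignments.zip"))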
DATASET_PARTS = [
    "dev-clean",
    "dev-other",
    "test-clean",
    "test-other",
    "train-clean-100",
    "train-clean-360",
    "train-other-500",
]

def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--alignments-dir",
        type=str,
        default="data/alignment",
        help="The dir to save alignments.",
    )

    parser.add_argument(
        "--cuts-in-dir",
        type=str,
        default="data/fbank",
        help="The dir of the existing cuts without alignments.",
    )

    parser.add_argument(
        "--cuts-out-dir",
        type=str,
        default="data/fbank_ali",
        help="The dir to save the new cuts with alignments.",
    )

    return parser

def download_alignments(
    target_dir: str, alignments_url: str = LIBRISPEECH_ALIGNMENTS_URL
):
    """
    Download and extract the alignments.

    Note: If you cannot access drive.google.com, you can download the file
    `LibriSpeech-Alignments.zip` from huggingface:
    https://huggingface.co/Zengwei/librispeech-alignments
    and extract the zip file manually.

    Args:
      target_dir:
        The dir to save alignments.
      alignments_url:
        The URL of the alignments.
    """
    """Modified from https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/librispeech.py"""  # noqa
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    completed_detector = target_dir / ".ali_completed"
    if completed_detector.is_file():
        logging.info("The alignment files already exist.")
        return

    ali_zip_path = target_dir / "LibriSpeech-Alignments.zip"
    if not ali_zip_path.is_file():
        assert is_module_available(
            "gdown"
        ), 'To download LibriSpeech alignments, please run "pip install gdown"'  # noqa
        import gdown

        gdown.download(alignments_url, output=str(ali_zip_path))

    with zipfile.ZipFile(str(ali_zip_path)) as f:
        f.extractall(path=target_dir)
    completed_detector.touch()

def add_alignment(
    alignments_dir: str,
    cuts_in_dir: str = "data/fbank",
    cuts_out_dir: str = "data/fbank_ali",
    dataset_parts: List[str] = DATASET_PARTS,
):
    """
    Add alignment info to existing cuts.

    Args:
      alignments_dir:
        The dir of the alignments.
      cuts_in_dir:
        The dir of the existing cuts.
      cuts_out_dir:
        The dir to save the new cuts with alignments.
      dataset_parts:
        The LibriSpeech parts to add alignments to.
    """
    alignments_dir = Path(alignments_dir)
    cuts_in_dir = Path(cuts_in_dir)
    cuts_out_dir = Path(cuts_out_dir)
    cuts_out_dir.mkdir(parents=True, exist_ok=True)

    for part in dataset_parts:
        logging.info(f"Processing {part}")

        cuts_in_path = cuts_in_dir / f"librispeech_cuts_{part}.jsonl.gz"
        if not cuts_in_path.is_file():
            logging.info(f"{cuts_in_path} does not exist - skipping.")
            continue
        cuts_out_path = cuts_out_dir / f"librispeech_cuts_{part}.jsonl.gz"
        if cuts_out_path.is_file():
            logging.info(f"{part} already exists - skipping.")
            continue

        # parse alignments
        alignments = {}
        part_ali_dir = alignments_dir / "LibriSpeech" / part
        for ali_path in part_ali_dir.rglob("*.alignment.txt"):
            ali = parse_alignments(ali_path)
            alignments.update(ali)
        logging.info(
            f"{part} has {len(alignments.keys())} cuts with alignments."
        )

        # add alignment attribute and write out
        cuts_in = load_manifest_lazy(cuts_in_path)
        with CutSet.open_writer(cuts_out_path) as writer:
            for cut in cuts_in:
                for idx, subcut in enumerate(cut.supervisions):
                    origin_id = subcut.id.split("_")[0]
                    if origin_id in alignments:
                        ali = alignments[origin_id]
                    else:
                        logging.info(
                            f"Warning: {origin_id} does not have an alignment."
                        )
                        ali = []
                    subcut.alignment = {"word": ali}
                writer.write(cut, flush=True)

Review comment: Could you print a warning for those cuts that don't have an alignment?

Review comment: I suggest that at the end we print the following information:
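As one possibility (a sketch only, not necessarily what the reviewer had in mind), the end of each part could log how many cuts were written and how many supervisions fell back to an empty alignment; num_cuts and num_missing are hypothetical names, and the loop mirrors the one in add_alignment() above:

        # Sketch of an end-of-part summary; num_cuts / num_missing are
        # hypothetical names.
        num_cuts = 0
        num_missing = 0
        with CutSet.open_writer(cuts_out_path) as writer:
            for cut in cuts_in:
                for subcut in cut.supervisions:
                    origin_id = subcut.id.split("_")[0]
                    ali = alignments.get(origin_id, [])
                    if not ali:
                        num_missing += 1
                    subcut.alignment = {"word": ali}
                writer.write(cut, flush=True)
                num_cuts += 1
        logging.info(
            f"{part}: wrote {num_cuts} cuts; "
            f"{num_missing} supervisions had no alignment."
        )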
def main():
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)

    parser = get_parser()
    args = parser.parse_args()
    logging.info(vars(args))

    download_alignments(args.alignments_dir)
    add_alignment(args.alignments_dir, args.cuts_in_dir, args.cuts_out_dir)


if __name__ == "__main__":
    main()
Review comment: Could you add some doc describing what this file does?
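One way to address this, sketched here with hypothetical wording (the script's existing module docstring already covers part of it), is a fuller module docstring summarizing the whole flow:

"""
Sketch of expanded module documentation; the wording is hypothetical.

This script downloads the LibriSpeech word alignments released at
https://github.com/CorentinJ/librispeech-alignments (or obtained manually from
https://huggingface.co/Zengwei/librispeech-alignments), parses them with
lhotse's parse_alignments(), and attaches them to the supervisions of the
existing cut manifests under --cuts-in-dir. The augmented manifests are written
to --cuts-out-dir as librispeech_cuts_<part>.jsonl.gz, with the alignment
stored on each supervision as supervision.alignment = {"word": [...]}, so that
word-level timestamps are available during decoding.
"""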