Get timestamps during decoding #598

Merged 20 commits on Nov 1, 2022
12 changes: 12 additions & 0 deletions egs/librispeech/ASR/add_alignments.sh
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

Review comment (Collaborator): Could you add some doc describing what this file does?

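# This script downloads the word alignments from
# https://github.com/CorentinJ/librispeech-alignments, attaches them to the
# existing fbank cuts in $cuts_in_dir, and writes the new cuts with alignments
# to $cuts_out_dir (see ./local/add_alignment_librispeech.py).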
set -eou pipefail

alignments_dir=data/alignment
cuts_in_dir=data/fbank
cuts_out_dir=data/fbank_ali

python3 ./local/add_alignment_librispeech.py \
--alignments-dir $alignments_dir \
--cuts-in-dir $cuts_in_dir \
--cuts-out-dir $cuts_out_dir
196 changes: 196 additions & 0 deletions egs/librispeech/ASR/local/add_alignment_librispeech.py
@@ -0,0 +1,196 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (authors: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
This file adds alignments from https://github.com/CorentinJ/librispeech-alignments # noqa
to the existing fbank features dir (e.g., data/fbank)
and saves the cuts to a new dir (e.g., data/fbank_ali).
"""

import argparse
import logging
import zipfile
from pathlib import Path
from typing import List

from lhotse import CutSet, load_manifest_lazy
from lhotse.recipes.librispeech import parse_alignments
from lhotse.utils import is_module_available

LIBRISPEECH_ALIGNMENTS_URL = (
"https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
)

Review comment (Collaborator): Some users from China may not have access to Google. Could you make a copy of this data and upload it to Hugging Face? You can leave a comment here saying that the alignment can also be found on Hugging Face.
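To illustrate the alternative the reviewer (and the docstring further down) mention, here is a minimal sketch of fetching the zip from the Hugging Face mirror instead of Google Drive. The helper name, the exact filename inside the repo, and the use of huggingface_hub are assumptions and are not part of this PR:

import zipfile
from pathlib import Path

from huggingface_hub import hf_hub_download  # pip install huggingface_hub


def download_alignments_from_hf(target_dir: str) -> None:
    """Hypothetical helper: download LibriSpeech-Alignments.zip (filename
    assumed) from https://huggingface.co/Zengwei/librispeech-alignments
    and extract it into target_dir."""
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # hf_hub_download returns the path of the locally cached file.
    zip_path = hf_hub_download(
        repo_id="Zengwei/librispeech-alignments",
        filename="LibriSpeech-Alignments.zip",  # assumed filename in the repo
    )
    with zipfile.ZipFile(zip_path) as f:
        f.extractall(path=target_dir)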

DATASET_PARTS = [
"dev-clean",
"dev-other",
"test-clean",
"test-other",
"train-clean-100",
"train-clean-360",
"train-other-500",
]


def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

parser.add_argument(
"--alignments-dir",
type=str,
default="data/alignment",
help="The dir to save alignments.",
)

parser.add_argument(
"--cuts-in-dir",
type=str,
default="data/fbank",
help="The dir of the existing cuts without alignments.",
)

parser.add_argument(
"--cuts-out-dir",
type=str,
default="data/fbank_ali",
help="The dir to save the new cuts with alignments",
)

return parser


def download_alignments(
target_dir: str, alignments_url: str = LIBRISPEECH_ALIGNMENTS_URL
):
"""
Download and extract the alignments.

    Note: If you cannot access drive.google.com, you can download the file
`LibriSpeech-Alignments.zip` from huggingface:
https://huggingface.co/Zengwei/librispeech-alignments
and extract the zip file manually.

Args:
target_dir:
The dir to save alignments.
alignments_url:
The URL of alignments.
"""
"""Modified from https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/librispeech.py""" # noqa
target_dir = Path(target_dir)
target_dir.mkdir(parents=True, exist_ok=True)
completed_detector = target_dir / ".ali_completed"
if completed_detector.is_file():
logging.info("The alignment files already exist.")
return

ali_zip_path = target_dir / "LibriSpeech-Alignments.zip"
if not ali_zip_path.is_file():
assert is_module_available(
"gdown"
        ), 'To download LibriSpeech alignments, please run "pip install gdown".'  # noqa
import gdown

gdown.download(alignments_url, output=str(ali_zip_path))

with zipfile.ZipFile(str(ali_zip_path)) as f:
f.extractall(path=target_dir)
completed_detector.touch()


def add_alignment(
alignments_dir: str,
cuts_in_dir: str = "data/fbank",
cuts_out_dir: str = "data/fbank_ali",
dataset_parts: List[str] = DATASET_PARTS,
):
"""
Add alignment info to existing cuts.

Args:
alignments_dir:
The dir of the alignments.
cuts_in_dir:
The dir of the existing cuts.
cuts_out_dir:
The dir to save the new cuts with alignments.
dataset_parts:
        LibriSpeech parts to add alignments to.
"""
alignments_dir = Path(alignments_dir)
cuts_in_dir = Path(cuts_in_dir)
cuts_out_dir = Path(cuts_out_dir)
cuts_out_dir.mkdir(parents=True, exist_ok=True)

for part in dataset_parts:
logging.info(f"Processing {part}")

cuts_in_path = cuts_in_dir / f"librispeech_cuts_{part}.jsonl.gz"
if not cuts_in_path.is_file():
logging.info(f"{cuts_in_path} does not exist - skipping.")
continue
cuts_out_path = cuts_out_dir / f"librispeech_cuts_{part}.jsonl.gz"
if cuts_out_path.is_file():
logging.info(f"{part} already exists - skipping.")
continue

# parse alignments
alignments = {}
part_ali_dir = alignments_dir / "LibriSpeech" / part
for ali_path in part_ali_dir.rglob("*.alignment.txt"):
ali = parse_alignments(ali_path)
alignments.update(ali)
logging.info(
f"{part} has {len(alignments.keys())} cuts with alignments."
)

# add alignment attribute and write out
cuts_in = load_manifest_lazy(cuts_in_path)
with CutSet.open_writer(cuts_out_path) as writer:
for cut in cuts_in:
for idx, subcut in enumerate(cut.supervisions):
origin_id = subcut.id.split("_")[0]
if origin_id in alignments:
ali = alignments[origin_id]
else:
logging.info(
f"Warning: {origin_id} does not has alignment."
)
ali = []

Review comment (Collaborator): Could you print a warning for those cuts that don't have an alignment?

subcut.alignment = {"word": ali}
writer.write(cut, flush=True)

Review comment (Collaborator): I suggest that at the end we print the following information:

  • IDs of cuts that don't have alignments
  • IDs of alignments that don't have a corresponding cut.

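A minimal sketch of how that suggestion could be implemented, written as a hypothetical helper (write_cuts_with_alignments is not part of this PR) that mirrors the writer loop above and logs both ID lists for each part:

import logging
from typing import Dict, List

from lhotse import CutSet


def write_cuts_with_alignments(
    cuts_in: CutSet,
    alignments: Dict[str, List],
    cuts_out_path,
    part: str,
) -> None:
    """Attach word alignments to each supervision and report unmatched IDs."""
    cut_ids_without_ali = []
    seen_origin_ids = set()
    with CutSet.open_writer(cuts_out_path) as writer:
        for cut in cuts_in:
            for subcut in cut.supervisions:
                origin_id = subcut.id.split("_")[0]
                seen_origin_ids.add(origin_id)
                if origin_id in alignments:
                    ali = alignments[origin_id]
                else:
                    cut_ids_without_ali.append(cut.id)
                    ali = []
                subcut.alignment = {"word": ali}
            writer.write(cut, flush=True)

    # Alignments whose utterance ID never appeared in any supervision.
    ali_ids_without_cut = sorted(set(alignments) - seen_origin_ids)
    logging.info(
        f"{part}: {len(cut_ids_without_ali)} cuts without alignments: "
        f"{cut_ids_without_ali}"
    )
    logging.info(
        f"{part}: {len(ali_ids_without_cut)} alignments without a "
        f"corresponding cut: {ali_ids_without_cut}"
    )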

def main():
formatter = (
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
)
logging.basicConfig(format=formatter, level=logging.INFO)

parser = get_parser()
args = parser.parse_args()
logging.info(vars(args))

download_alignments(args.alignments_dir)
add_alignment(args.alignments_dir, args.cuts_in_dir, args.cuts_out_dir)


if __name__ == "__main__":
main()