Add subtitle alignment recipe (#65)

k2-fsa · Jun 3, 2024 · 7c452ed · 7c452ed
1 parent bc2bed3
commit 7c452ed
Show file tree

Hide file tree

Showing 3 changed files with 494 additions and 0 deletions.
diff --git a/examples/subtitle/matching.py b/examples/subtitle/matching.py
@@ -0,0 +1,272 @@
+#!/usr/bin/env python3
+# Copyright    2024  Xiaomi Corp.        (authors: Wei Kang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+import numpy as np
+import os
+from datetime import datetime
+from multiprocessing.pool import ThreadPool
+from pathlib import Path
+from typing import Any, Dict, List, Set, Optional, Tuple, Union
+
+from lhotse import CutSet, MonoCut, SupervisionSegment, load_manifest_lazy
+from lhotse.serialization import SequentialJsonlWriter
+from textsearch import (
+    AttributeDict,
+    TextSource,
+    Transcript,
+    levenshtein_distance,
+)
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--manifest-in",
+        type=str,
+        help="""The manifest generated by transcript stage containing book path,
+        recordings path and recognition results.
+        """,
+    )
+    parser.add_argument(
+        "--manifest-out",
+        type=str,
+        help="""The file name of the new manifests to write to. 
+        """,
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=50,
+        help="""The number of cuts in a batch.
+        """,
+    )
+    return parser.parse_args()
+
+
+def get_params() -> AttributeDict:
+    """Return a dict containing matching parameters.
+
+    All related parameters that are not passed from the commandline
+    are saved in the variable `params`.
+
+    Commandline options are merged into `params` after they are parsed, so
+    you can also access them via `params`.
+
+    """
+    params = AttributeDict(
+        {
+            # parameters for loading source texts
+            # you can find the docs in textsearch/datatypes.py
+            "use_utf8": False,
+            "is_bpe": True,
+            "use_uppercase": True,
+            # parameters for adding timestamps for each subtitle
+            "duration_add_on_left": 0.0,
+            "duration_add_on_right": 0.0,
+            "max_error_rate": 0.10,
+        }
+    )
+
+    return params
+
+
+def write(
+    params: AttributeDict,
+    batch_cuts: List[MonoCut],
+    results,
+    cuts_writer: SequentialJsonlWriter,
+):
+    """
+    Write the segmented results to disk as new manifests.
+
+    Args:
+      batch_cuts:
+        The original batch cuts.
+      results:
+        Returned from `align`.
+      cuts_writer:
+        The writer used to write the new manifests out.
+    """
+    cut_keep_sup: Dict[str, List[int]] = {}
+    cut_list = []
+    for item in results:
+        if item is None:
+            continue
+        cut_index, sup_index = item[0]
+        start_time, end_time = item[1], item[2]
+
+        current_cut = batch_cuts[cut_index]
+        if current_cut.id not in cut_keep_sup:
+            cut_keep_sup[current_cut.id] = [sup_index]
+        else:
+            cut_keep_sup[current_cut.id].append(sup_index)
+
+        current_cut.supervisions[sup_index].start = start_time
+        current_cut.supervisions[sup_index].duration = end_time - start_time
+        current_cut.supervisions[sup_index].alignment = None
+
+    for j in range(len(batch_cuts) - 1, -1, -1):
+        if batch_cuts[j].id not in cut_keep_sup:
+            del batch_cuts[j]
+            continue
+        keep_sups = set(cut_keep_sup[batch_cuts[j].id])
+        for i in range(len(batch_cuts[j].supervisions) - 1, -1, -1):
+            if i not in keep_sups:
+                del cut.supervisions[i]
+        if len(batch_cuts[j].supervisions) == 0:
+            del batch_cuts[j]
+
+    logging.debug(f"Writing results.")
+    for i, cut in enumerate(batch_cuts):
+        # Flushing only on last cut to accelerate writing.
+        cuts_writer.write(cut, flush=(i == len(batch_cuts) - 1))
+    logging.debug(f"Write results done.")
+
+
+def align(
+    transcript: Transcript,
+    reference: TextSource,
+    cut_index: Tuple[int, int],
+    max_error_rate: float,
+):
+    distance, alignment = levenshtein_distance(
+        reference.binary_text, transcript.binary_text
+    )
+    ref_length = reference.binary_text.size
+    if distance / ref_length > max_error_rate:
+        return None
+    start, end, _ = alignment[
+        0
+    ]  # select the first alignment, normally it will be only one alignment
+
+    # The times is in byte level.
+    time_stride = 1 if transcript.binary_text.dtype == np.uint8 else 4
+
+    end = end + 1 if end + 1 < transcript.binary_text.size else end
+
+    start_time = float(transcript.times[start * time_stride])
+    end_time = float(transcript.times[end * time_stride])
+
+    return (cut_index, start_time, end_time)
+
+
+def process_one_batch(
+    params: AttributeDict,
+    batch_cuts: List[MonoCut],
+    thread_pool: ThreadPool,
+    cuts_writer: SequentialJsonlWriter,
+):
+    # Contains cut index and local supervision index
+    transcripts_cut_index: List[Tuple[int, int]] = []
+    transcripts: List[Transcript] = []
+    texts: List[TextSource] = []
+
+    arguments: List[Tuple[Transcript, TextSource, Tuple[int, int], float]] = []
+
+    # Construct transcript and textsource
+    for i, cut in enumerate(batch_cuts):
+        for j, sup in enumerate(cut.supervisions):
+            # Transcript requires the input to be the dict like this.
+            text_list = []
+            begin_times_list = []
+            for ali in sup.alignment["symbol"]:
+                text_list.append(ali.symbol)
+                begin_times_list.append(ali.start)
+            aligns = {"text": text_list, "begin_times": begin_times_list}
+            # alignments in a supervision might be empty
+            if aligns["text"]:
+                transcript = Transcript.from_dict(
+                    name=sup.id,
+                    d=aligns,
+                    use_utf8=params.use_utf8,
+                    is_bpe=params.is_bpe,
+                )
+
+                text = TextSource.from_str(
+                    name=sup.id,
+                    s=sup.text,
+                    use_utf8=params.use_utf8,
+                )
+                arguments.append(
+                    (transcript, text, (i, j), params.max_error_rate)
+                )
+    logging.debug(f"Aligning with levenshtein for {len(arguments)} segments.")
+    async_results = thread_pool.starmap_async(align, arguments)
+    results = async_results.get()
+    logging.debug("Aligning with levenshtein done.")
+
+    write(
+        params=params,
+        batch_cuts=batch_cuts,
+        results=results,
+        cuts_writer=cuts_writer,
+    )
+
+
+def main():
+    args = get_args()
+    params = get_params()
+    params.update(vars(args))
+
+    logging.info(f"params : {params}")
+
+    raw_cuts = load_manifest_lazy(params.manifest_in)
+    cuts_writer = CutSet.open_writer(params.manifest_out, overwrite=True)
+
+    # thread_pool to run the levenshtein alignment.
+    # we use thread_pool here because the levenshtein run on C++ with GIL released.
+    thread_pool = ThreadPool()
+
+    batch_cuts = []
+    logging.info(f"Start processing...")
+    for i, cut in enumerate(raw_cuts):
+        if len(batch_cuts) >= params.batch_size:
+            process_one_batch(
+                params,
+                batch_cuts=batch_cuts,
+                thread_pool=thread_pool,
+                cuts_writer=cuts_writer,
+            )
+            batch_cuts = []
+            logging.info(f"Number of cuts have been loaded is {i}")
+        batch_cuts.append(cut)
+    if len(batch_cuts):
+        process_one_batch(
+            params,
+            batch_cuts=batch_cuts,
+            thread_pool=thread_pool,
+            cuts_writer=cuts_writer,
+        )
+
+
+if __name__ == "__main__":
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+    now = datetime.now()
+    data_time = now.strftime("%Y-%m-%d-%H-%M-%S")
+    os.makedirs("logs", exist_ok=True)
+    log_file_name = f"logs/matching_{data_time}"
+    logging.basicConfig(
+        level=logging.INFO,
+        format=formatter,
+        handlers=[logging.FileHandler(log_file_name), logging.StreamHandler()],
+    )
+
+    main()