From b498d438fc4c35ebf364a9a1c5cd3e29a2c0fe50 Mon Sep 17 00:00:00 2001
From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com>
Date: Thu, 3 Aug 2023 14:41:09 -0700
Subject: [PATCH] NFA subtitle file config - specify colors and vertical
 alignment (#7160)

* allow specifying colors of text in ASS subtitle file

Signed-off-by: Elena Rastorgueva <erastorgueva@nvidia.com>

* specify vertical_alignment instead of marginv in ass_file_config

Signed-off-by: Elena Rastorgueva <erastorgueva@nvidia.com>

* add documentation of CTMFileConfig and ASSFileConfig to NFA README

Signed-off-by: Elena Rastorgueva <erastorgueva@nvidia.com>

---------

Signed-off-by: Elena Rastorgueva <erastorgueva@nvidia.com>
---
 tools/nemo_forced_aligner/README.md           | 14 ++++
 tools/nemo_forced_aligner/align.py            | 21 +++++-
 .../utils/make_ass_files.py                   | 64 ++++++++++++++-----
 3 files changed, 83 insertions(+), 16 deletions(-)
diff --git a/tools/nemo_forced_aligner/README.md b/tools/nemo_forced_aligner/README.md
index 423c76878db6..1fe9097db3d6 100644
--- a/tools/nemo_forced_aligner/README.md
+++ b/tools/nemo_forced_aligner/README.md
@@ -82,12 +82,26 @@ Each CTM file will contain lines of the format:
 `<utt_id> 1 <start time in seconds> <duration in seconds> <text, ie token/word/segment>`.
 Note the second item in the line (the 'channel ID', which is required by the CTM file format) is always 1, as NFA operates on single channel audio.
 
+### `CTMFileConfig` parameters
+The `CTMFileConfig` (which is passed into the main NFA config) has the following parameters:
+* `remove_blank_tokens`: bool (default `False`) to specify if the token-level CTM files should have the timestamps of the blank tokens removed.
+* `minimum_timestamp_duration`: float (default `0`) to specify the minimum duration that will be applied to all timestamps. If any line in the CTM has a duration lower than this, it will be enlarged from the middle outwards until it meets the `minimum_timestamp_duration`, or reaches the beginning or end of the audio file. Note that using a non-zero value may cause timestamps to overlap.
+
 # Output ASS file format
 NFA will produce the following ASS files, which you can use to generate subtitle videos:
 * ASS files with token-level highlighting will be at `<output_dir>/ass/tokens/<utt_id>.ass,`
 * ASS files with word-level highlighting will be at `<output_dir>/ass/words/<utt_id>.ass`.
 All words belonging to the same segment 'segments' will appear at the same time in the subtitles generated with the ASS files. If you find that your segments are not the right size, you can use set `ass_file_config.resegment_text_to_fill_space=true` and specify some number of `ass_file_config.max_lines_per_segment`.
 
+### `ASSFileConfig` parameters
+The `ASSFileConfig` (which is passed into the main NFA config) has the following parameters:
+* `fontsize`: int (default value `20`) which will be the fontsize of the text
+* `vertical_alignment`: string (default value `center`) to specify the vertical alignment of the text. Can be one of `center`, `top`, `bottom`.
+* `resegment_text_to_fill_space`: bool (default value `False`). If `True`, the text will be resegmented such that each segment will not take up more than (approximately) `max_lines_per_segment` when the ASS file is applied to a video.
+* `max_lines_per_segment`: int (default value `2`) which specifies the number of lines per segment to display. This parameter is only used if `resegment_text_to_fill_space` is `True`.
+* `text_already_spoken_rgb`: List of 3 ints (default value is [49, 46, 61], which makes a dark gray). The RGB values of the color that will be used to highlight text that has already been spoken.
+* `text_being_spoken_rgb`: List of 3 ints (default value is [57, 171, 9] which makes a dark green). The RGB values of the color that will be used to highlight text that is being spoken.
+* `text_not_yet_spoken_rgb`: List of 3 ints (default value is [194, 193, 199] which makes a light gray). The RGB values of the color that will be used to highlight text that has not yet been spoken.
 
 # Output JSON manifest file format
 A new manifest file will be saved at `<output_dir>/<original manifest file name>_with_output_file_paths.json`. It will contain the same fields as the original manifest, and additionally:
diff --git a/tools/nemo_forced_aligner/align.py b/tools/nemo_forced_aligner/align.py
index 296c4a009cc4..77ab3111fd91 100644
--- a/tools/nemo_forced_aligner/align.py
+++ b/tools/nemo_forced_aligner/align.py
@@ -110,12 +110,15 @@ class CTMFileConfig:
 @dataclass
 class ASSFileConfig:
     fontsize: int = 20
-    marginv: int = 20
+    vertical_alignment: str = "center"
     # if resegment_text_to_fill_space is True, the ASS files will use new segments
     # such that each segment will not take up more than (approximately) max_lines_per_segment
     # when the ASS file is applied to a video
     resegment_text_to_fill_space: bool = False
     max_lines_per_segment: int = 2
+    text_already_spoken_rgb: List[int] = field(default_factory=lambda: [49, 46, 61])  # dark gray
+    text_being_spoken_rgb: List[int] = field(default_factory=lambda: [57, 171, 9])  # dark green
+    text_not_yet_spoken_rgb: List[int] = field(default_factory=lambda: [194, 193, 199])  # light gray
 
 
 @dataclass
@@ -180,6 +183,22 @@ def main(cfg: AlignmentConfig):
     if cfg.ctm_file_config.minimum_timestamp_duration < 0:
         raise ValueError("cfg.minimum_timestamp_duration cannot be a negative number")
 
+    if cfg.ass_file_config.vertical_alignment not in ["top", "center", "bottom"]:
+        raise ValueError("cfg.ass_file_config.vertical_alignment must be one of 'top', 'center' or 'bottom'")
+
+    for rgb_list in [  # validate each of the three distinct color options
+        cfg.ass_file_config.text_already_spoken_rgb,
+        cfg.ass_file_config.text_being_spoken_rgb,
+        cfg.ass_file_config.text_not_yet_spoken_rgb,
+    ]:
+        if len(rgb_list) != 3:  # a color must be given as exactly [r, g, b]
+            raise ValueError(
+                "cfg.ass_file_config.text_already_spoken_rgb,"
+                " cfg.ass_file_config.text_being_spoken_rgb,"
+                " and cfg.ass_file_config.text_not_yet_spoken_rgb all need to contain"
+                " exactly 3 elements."
+            )
+
     # Validate manifest contents
     if not is_entry_in_all_lines(cfg.manifest_filepath, "audio_filepath"):
         raise RuntimeError(
diff --git a/tools/nemo_forced_aligner/utils/make_ass_files.py b/tools/nemo_forced_aligner/utils/make_ass_files.py
index f1beea838573..fa5956f3714b 100644
--- a/tools/nemo_forced_aligner/utils/make_ass_files.py
+++ b/tools/nemo_forced_aligner/utils/make_ass_files.py
@@ -32,6 +32,7 @@
 PLAYERRESY = 288
 MARGINL = 10
 MARGINR = 10
+MARGINV = 20
 
 
 def seconds_to_ass_format(seconds_float):
@@ -56,6 +57,11 @@ def seconds_to_ass_format(seconds_float):
     return srt_format_time
 
 
+def rgb_list_to_hex_bgr(rgb_list):  # [r, g, b] ints (assumed 0-255) -> 'bbggrr' hex string for ASS color tags
+    r, g, b = rgb_list
+    return f"{b:02x}{g:02x}{r:02x}"  # zero-pad so channels below 16 still produce two hex digits each
+
+
 def make_ass_files(
     utt_obj, output_dir_root, ass_file_config,
 ):
@@ -107,7 +113,7 @@ def resegment_utt_obj(utt_obj, ass_file_config):
     approx_chars_per_line = (PLAYERRESX - MARGINL - MARGINR) / (
         ass_file_config.fontsize * 0.6
     )  # assume chars 0.6 as wide as they are tall
-    approx_lines_per_segment = (PLAYERRESY - ass_file_config.marginv) / (
+    approx_lines_per_segment = (PLAYERRESY - MARGINV) / (
         ass_file_config.fontsize * 1.15
     )  # assume line spacing is 1.15
     if approx_lines_per_segment > ass_file_config.max_lines_per_segment:
@@ -183,17 +189,30 @@ def make_word_level_ass_file(
         "BorderStyle": "1",
         "Outline": "1",
         "Shadow": "0",
-        "Alignment": "2",
+        "Alignment": None,  # will specify below
         "MarginL": str(MARGINL),
         "MarginR": str(MARGINR),
-        "MarginV": str(ass_file_config.marginv),
+        "MarginV": str(MARGINV),
         "Encoding": "0",
     }
 
+    if ass_file_config.vertical_alignment == "top":
+        default_style_dict["Alignment"] = "8"  # text will be 'center-justified' and in the top of the screen
+    elif ass_file_config.vertical_alignment == "center":
+        default_style_dict["Alignment"] = "5"  # text will be 'center-justified' and in the middle of the screen
+    elif ass_file_config.vertical_alignment == "bottom":
+        default_style_dict["Alignment"] = "2"  # text will be 'center-justified' and in the bottom of the screen
+    else:
+        raise ValueError(f"got an unexpected value for ass_file_config.vertical_alignment")
+
     output_dir = os.path.join(output_dir_root, "ass", "words")
     os.makedirs(output_dir, exist_ok=True)
     output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass")
 
+    already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}"
+    being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}"
+    not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}"
+
     with open(output_file, 'w') as f:
         default_style_top_line = "Format: " + ", ".join(default_style_dict.keys())
         default_style_bottom_line = "Style: " + ",".join(default_style_dict.values())
@@ -225,7 +244,7 @@ def make_word_level_ass_file(
                         words_in_first_segment.append(word_or_token)
                 break
 
-        text_before_speech = r"{\c&c7c1c2&}" + " ".join([x.text for x in words_in_first_segment]) + r"{\r}"
+        text_before_speech = not_yet_spoken_color_code + " ".join([x.text for x in words_in_first_segment]) + r"{\r}"
         subtitle_text = (
             f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(words_in_first_segment[0].t_start)},Default,,0,0,0,,"
             + text_before_speech.rstrip()
@@ -247,16 +266,16 @@ def make_word_level_ass_file(
                     text_before = " ".join([x.text for x in words_in_segment[:word_i]])
                     if text_before != "":
                         text_before += " "
-                    text_before = r"{\c&H3d2e31&}" + text_before + r"{\r}"
+                    text_before = already_spoken_color_code + text_before + r"{\r}"
 
                     if word_i < len(words_in_segment) - 1:
                         text_after = " " + " ".join([x.text for x in words_in_segment[word_i + 1 :]])
                     else:
                         text_after = ""
-                    text_after = r"{\c&c7c1c2&}" + text_after + r"{\r}"
+                    text_after = not_yet_spoken_color_code + text_after + r"{\r}"
 
-                    aligned_text = r"{\c&H09ab39&}" + word.text + r"{\r}"
-                    aligned_text_off = r"{\c&H3d2e31&}" + word.text + r"{\r}"
+                    aligned_text = being_spoken_color_code + word.text + r"{\r}"
+                    aligned_text_off = already_spoken_color_code + word.text + r"{\r}"
 
                     subtitle_text = (
                         f"Dialogue: 0,{seconds_to_ass_format(word.t_start)},{seconds_to_ass_format(word.t_end)},Default,,0,0,0,,"
@@ -307,17 +326,30 @@ def make_token_level_ass_file(
         "BorderStyle": "1",
         "Outline": "1",
         "Shadow": "0",
-        "Alignment": "2",
+        "Alignment": None,  # will specify below
         "MarginL": str(MARGINL),
         "MarginR": str(MARGINR),
-        "MarginV": str(ass_file_config.marginv),
+        "MarginV": str(MARGINV),
         "Encoding": "0",
     }
 
+    if ass_file_config.vertical_alignment == "top":
+        default_style_dict["Alignment"] = "8"  # text will be 'center-justified' and in the top of the screen
+    elif ass_file_config.vertical_alignment == "center":
+        default_style_dict["Alignment"] = "5"  # text will be 'center-justified' and in the middle of the screen
+    elif ass_file_config.vertical_alignment == "bottom":
+        default_style_dict["Alignment"] = "2"  # text will be 'center-justified' and in the bottom of the screen
+    else:
+        raise ValueError(f"got an unexpected value for ass_file_config.vertical_alignment")
+
     output_dir = os.path.join(output_dir_root, "ass", "tokens")
     os.makedirs(output_dir, exist_ok=True)
     output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass")
 
+    already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}"
+    being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}"
+    not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}"
+
     with open(output_file, 'w') as f:
         default_style_top_line = "Format: " + ", ".join(default_style_dict.keys())
         default_style_bottom_line = "Style: " + ",".join(default_style_dict.values())
@@ -360,7 +392,9 @@ def make_token_level_ass_file(
             )  # replace underscores used in subword tokens with spaces
             token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space
 
-        text_before_speech = r"{\c&c7c1c2&}" + "".join([x.text_cased for x in tokens_in_first_segment]) + r"{\r}"
+        text_before_speech = (
+            not_yet_spoken_color_code + "".join([x.text_cased for x in tokens_in_first_segment]) + r"{\r}"
+        )
         subtitle_text = (
             f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(tokens_in_first_segment[0].t_start)},Default,,0,0,0,,"
             + text_before_speech.rstrip()
@@ -391,16 +425,16 @@ def make_token_level_ass_file(
                 for token_i, token in enumerate(tokens_in_segment):
 
                     text_before = "".join([x.text_cased for x in tokens_in_segment[:token_i]])
-                    text_before = r"{\c&H3d2e31&}" + text_before + r"{\r}"
+                    text_before = already_spoken_color_code + text_before + r"{\r}"
 
                     if token_i < len(tokens_in_segment) - 1:
                         text_after = "".join([x.text_cased for x in tokens_in_segment[token_i + 1 :]])
                     else:
                         text_after = ""
-                    text_after = r"{\c&c7c1c2&}" + text_after + r"{\r}"
+                    text_after = not_yet_spoken_color_code + text_after + r"{\r}"
 
-                    aligned_text = r"{\c&H09ab39&}" + token.text_cased + r"{\r}"
-                    aligned_text_off = r"{\c&H3d2e31&}" + token.text_cased + r"{\r}"
+                    aligned_text = being_spoken_color_code + token.text_cased + r"{\r}"
+                    aligned_text_off = already_spoken_color_code + token.text_cased + r"{\r}"
 
                     subtitle_text = (
                         f"Dialogue: 0,{seconds_to_ass_format(token.t_start)},{seconds_to_ass_format(token.t_end)},Default,,0,0,0,,"