From b498d438fc4c35ebf364a9a1c5cd3e29a2c0fe50 Mon Sep 17 00:00:00 2001 From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Date: Thu, 3 Aug 2023 14:41:09 -0700 Subject: [PATCH] NFA subtitle file config - specify colors and vertical alignment (#7160) * allow specifying colors of text in ASS subtitle file Signed-off-by: Elena Rastorgueva * specify vertical_alignment instead of marginv in ass_file_config Signed-off-by: Elena Rastorgueva * add documentation of CTMFileConfig and ASSFileConfig to NFA README Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva --- tools/nemo_forced_aligner/README.md | 14 ++++ tools/nemo_forced_aligner/align.py | 21 +++++- .../utils/make_ass_files.py | 64 ++++++++++++++----- 3 files changed, 83 insertions(+), 16 deletions(-) diff --git a/tools/nemo_forced_aligner/README.md b/tools/nemo_forced_aligner/README.md index 423c76878db6..1fe9097db3d6 100644 --- a/tools/nemo_forced_aligner/README.md +++ b/tools/nemo_forced_aligner/README.md @@ -82,12 +82,26 @@ Each CTM file will contain lines of the format: ` 1 `. Note the second item in the line (the 'channel ID', which is required by the CTM file format) is always 1, as NFA operates on single channel audio. +### `CTMFileConfig` parameters +The `CTMFileConfig` (which is passed into the main NFA config) has the following parameters: +* `remove_blank_tokens`: bool (default `False`) to specify if the token-level CTM files should have the timestamps of the blank tokens removed. +* `minimum_timestamp_duration`: float (default `0`) to specify the minimum duration that will be applied to all timestamps. If any line in the CTM has a duration lower than this, it will be enlarged from the middle outwards until it meets the `minimum_timestamp_duration`, or reaches the beginning or end of the audio file. Note that using a non-zero value may cause timestamps to overlap. 
+ # Output ASS file format NFA will produce the following ASS files, which you can use to generate subtitle videos: * ASS files with token-level highlighting will be at `/ass/tokens/.ass,` * ASS files with word-level highlighting will be at `/ass/words/.ass`. All words belonging to the same segment 'segments' will appear at the same time in the subtitles generated with the ASS files. If you find that your segments are not the right size, you can use set `ass_file_config.resegment_text_to_fill_space=true` and specify some number of `ass_file_config.max_lines_per_segment`. +### `ASSFileConfig` parameters +The `ASSFileConfig` (which is passed into the main NFA config) has the following parameters: +* `fontsize`: int (default value `20`) which will be the fontsize of the text +* `vertical_alignment`: string (default value `center`) to specify the vertical alignment of the text. Can be one of `center`, `top`, `bottom`. +* `resegment_text_to_fill_space`: bool (default value `False`). If `True`, the text will be resegmented such that each segment will not take up more than (approximately) `max_lines_per_segment` when the ASS file is applied to a video. +* `max_lines_per_segment`: int (default value `2`) which specifies the number of lines per segment to display. This parameter is only used if `resegment_text_to_fill_space` is `True`. +* `text_already_spoken_rgb`: List of 3 ints (default value is [49, 46, 61], which makes a dark gray). The RGB values of the color that will be used to highlight text that has already been spoken. +* `text_being_spoken_rgb`: List of 3 ints (default value is [57, 171, 9] which makes a dark green). The RGB values of the color that will be used to highlight text that is being spoken. +* `text_not_yet_spoken_rgb`: List of 3 ints (default value is [194, 193, 199] which makes a light gray). The RGB values of the color that will be used to highlight text that has not yet been spoken. 
# Output JSON manifest file format A new manifest file will be saved at `/_with_output_file_paths.json`. It will contain the same fields as the original manifest, and additionally: diff --git a/tools/nemo_forced_aligner/align.py b/tools/nemo_forced_aligner/align.py index 296c4a009cc4..77ab3111fd91 100644 --- a/tools/nemo_forced_aligner/align.py +++ b/tools/nemo_forced_aligner/align.py @@ -110,12 +110,15 @@ class CTMFileConfig: @dataclass class ASSFileConfig: fontsize: int = 20 - marginv: int = 20 + vertical_alignment: str = "center" # if resegment_text_to_fill_space is True, the ASS files will use new segments # such that each segment will not take up more than (approximately) max_lines_per_segment # when the ASS file is applied to a video resegment_text_to_fill_space: bool = False max_lines_per_segment: int = 2 + text_already_spoken_rgb: List[int] = field(default_factory=lambda: [49, 46, 61]) # dark gray + text_being_spoken_rgb: List[int] = field(default_factory=lambda: [57, 171, 9]) # dark green + text_not_yet_spoken_rgb: List[int] = field(default_factory=lambda: [194, 193, 199]) # light gray @dataclass @@ -180,6 +183,22 @@ def main(cfg: AlignmentConfig): if cfg.ctm_file_config.minimum_timestamp_duration < 0: raise ValueError("cfg.minimum_timestamp_duration cannot be a negative number") + if cfg.ass_file_config.vertical_alignment not in ["top", "center", "bottom"]: + raise ValueError("cfg.ass_file_config.vertical_alignment must be one of 'top', 'center' or 'bottom'") + + for rgb_list in [ + cfg.ass_file_config.text_already_spoken_rgb, + cfg.ass_file_config.text_being_spoken_rgb, + cfg.ass_file_config.text_not_yet_spoken_rgb, + ]: + if len(rgb_list) != 3: + raise ValueError( + "cfg.ass_file_config.text_already_spoken_rgb," + " cfg.ass_file_config.text_being_spoken_rgb," + " and cfg.ass_file_config.text_not_yet_spoken_rgb all need to contain" + " exactly 3 elements." 
+ ) + # Validate manifest contents if not is_entry_in_all_lines(cfg.manifest_filepath, "audio_filepath"): raise RuntimeError( diff --git a/tools/nemo_forced_aligner/utils/make_ass_files.py b/tools/nemo_forced_aligner/utils/make_ass_files.py index f1beea838573..fa5956f3714b 100644 --- a/tools/nemo_forced_aligner/utils/make_ass_files.py +++ b/tools/nemo_forced_aligner/utils/make_ass_files.py @@ -32,6 +32,7 @@ PLAYERRESY = 288 MARGINL = 10 MARGINR = 10 +MARGINV = 20 def seconds_to_ass_format(seconds_float): @@ -56,6 +57,11 @@ def seconds_to_ass_format(seconds_float): return srt_format_time +def rgb_list_to_hex_bgr(rgb_list): + r, g, b = rgb_list + return f"{b:02x}{g:02x}{r:02x}"  # zero-pad each channel so the BGR code is always 6 hex digits + + def make_ass_files( utt_obj, output_dir_root, ass_file_config, ): @@ -107,7 +113,7 @@ def resegment_utt_obj(utt_obj, ass_file_config): approx_chars_per_line = (PLAYERRESX - MARGINL - MARGINR) / ( ass_file_config.fontsize * 0.6 ) # assume chars 0.6 as wide as they are tall - approx_lines_per_segment = (PLAYERRESY - ass_file_config.marginv) / ( + approx_lines_per_segment = (PLAYERRESY - MARGINV) / ( ass_file_config.fontsize * 1.15 ) # assume line spacing is 1.15 if approx_lines_per_segment > ass_file_config.max_lines_per_segment: @@ -183,17 +189,30 @@ def make_word_level_ass_file( "BorderStyle": "1", "Outline": "1", "Shadow": "0", - "Alignment": "2", + "Alignment": None, # will specify below "MarginL": str(MARGINL), "MarginR": str(MARGINR), - "MarginV": str(ass_file_config.marginv), + "MarginV": str(MARGINV), "Encoding": "0", } + if ass_file_config.vertical_alignment == "top": + default_style_dict["Alignment"] = "8" # text will be 'center-justified' and in the top of the screen + elif ass_file_config.vertical_alignment == "center": + default_style_dict["Alignment"] = "5" # text will be 'center-justified' and in the middle of the screen + elif ass_file_config.vertical_alignment == "bottom": + default_style_dict["Alignment"] = "2" # text will be 'center-justified' and in the bottom of the screen + 
else: + raise ValueError(f"got an unexpected value for ass_file_config.vertical_alignment") + output_dir = os.path.join(output_dir_root, "ass", "words") os.makedirs(output_dir, exist_ok=True) output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass") + already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}" + being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}" + not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}" + with open(output_file, 'w') as f: default_style_top_line = "Format: " + ", ".join(default_style_dict.keys()) default_style_bottom_line = "Style: " + ",".join(default_style_dict.values()) @@ -225,7 +244,7 @@ def make_word_level_ass_file( words_in_first_segment.append(word_or_token) break - text_before_speech = r"{\c&c7c1c2&}" + " ".join([x.text for x in words_in_first_segment]) + r"{\r}" + text_before_speech = not_yet_spoken_color_code + " ".join([x.text for x in words_in_first_segment]) + r"{\r}" subtitle_text = ( f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(words_in_first_segment[0].t_start)},Default,,0,0,0,," + text_before_speech.rstrip() @@ -247,16 +266,16 @@ def make_word_level_ass_file( text_before = " ".join([x.text for x in words_in_segment[:word_i]]) if text_before != "": text_before += " " - text_before = r"{\c&H3d2e31&}" + text_before + r"{\r}" + text_before = already_spoken_color_code + text_before + r"{\r}" if word_i < len(words_in_segment) - 1: text_after = " " + " ".join([x.text for x in words_in_segment[word_i + 1 :]]) else: text_after = "" - text_after = r"{\c&c7c1c2&}" + text_after + r"{\r}" + text_after = not_yet_spoken_color_code + text_after + r"{\r}" - aligned_text = r"{\c&H09ab39&}" + word.text + r"{\r}" - aligned_text_off = r"{\c&H3d2e31&}" + word.text + r"{\r}" + aligned_text = being_spoken_color_code + word.text + r"{\r}" + aligned_text_off = 
already_spoken_color_code + word.text + r"{\r}" subtitle_text = ( f"Dialogue: 0,{seconds_to_ass_format(word.t_start)},{seconds_to_ass_format(word.t_end)},Default,,0,0,0,," @@ -307,17 +326,30 @@ def make_token_level_ass_file( "BorderStyle": "1", "Outline": "1", "Shadow": "0", - "Alignment": "2", + "Alignment": None, # will specify below "MarginL": str(MARGINL), "MarginR": str(MARGINR), - "MarginV": str(ass_file_config.marginv), + "MarginV": str(MARGINV), "Encoding": "0", } + if ass_file_config.vertical_alignment == "top": + default_style_dict["Alignment"] = "8" # text will be 'center-justified' and in the top of the screen + elif ass_file_config.vertical_alignment == "center": + default_style_dict["Alignment"] = "5" # text will be 'center-justified' and in the middle of the screen + elif ass_file_config.vertical_alignment == "bottom": + default_style_dict["Alignment"] = "2" # text will be 'center-justified' and in the bottom of the screen + else: + raise ValueError(f"got an unexpected value for ass_file_config.vertical_alignment") + output_dir = os.path.join(output_dir_root, "ass", "tokens") os.makedirs(output_dir, exist_ok=True) output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass") + already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}" + being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}" + not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}" + with open(output_file, 'w') as f: default_style_top_line = "Format: " + ", ".join(default_style_dict.keys()) default_style_bottom_line = "Style: " + ",".join(default_style_dict.values()) @@ -360,7 +392,9 @@ def make_token_level_ass_file( ) # replace underscores used in subword tokens with spaces token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ") # space token with actual space - text_before_speech = r"{\c&c7c1c2&}" + "".join([x.text_cased for x 
in tokens_in_first_segment]) + r"{\r}" + text_before_speech = ( + not_yet_spoken_color_code + "".join([x.text_cased for x in tokens_in_first_segment]) + r"{\r}" + ) subtitle_text = ( f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(tokens_in_first_segment[0].t_start)},Default,,0,0,0,," + text_before_speech.rstrip() @@ -391,16 +425,16 @@ def make_token_level_ass_file( for token_i, token in enumerate(tokens_in_segment): text_before = "".join([x.text_cased for x in tokens_in_segment[:token_i]]) - text_before = r"{\c&H3d2e31&}" + text_before + r"{\r}" + text_before = already_spoken_color_code + text_before + r"{\r}" if token_i < len(tokens_in_segment) - 1: text_after = "".join([x.text_cased for x in tokens_in_segment[token_i + 1 :]]) else: text_after = "" - text_after = r"{\c&c7c1c2&}" + text_after + r"{\r}" + text_after = not_yet_spoken_color_code + text_after + r"{\r}" - aligned_text = r"{\c&H09ab39&}" + token.text_cased + r"{\r}" - aligned_text_off = r"{\c&H3d2e31&}" + token.text_cased + r"{\r}" + aligned_text = being_spoken_color_code + token.text_cased + r"{\r}" + aligned_text_off = already_spoken_color_code + token.text_cased + r"{\r}" subtitle_text = ( f"Dialogue: 0,{seconds_to_ass_format(token.t_start)},{seconds_to_ass_format(token.t_end)},Default,,0,0,0,,"