Commit efe09d7

Merge branch 'main' into jiemingz/fp8_block
2 parents: 0fcba54 + 38e9ef1

File tree

2 files changed: 126 additions & 167 deletions


nemo_rl/data/llm_message_utils.py

Lines changed: 12 additions & 2 deletions
@@ -548,13 +548,23 @@ def _format_content_helper(
             message_chunk = tokenizer.bos_token + message_chunk
 
         if i == len(message_log_strs) - 1:
-            message_chunk = message_chunk.rstrip("\n")
+            r"""
+            This is an attempt to robustly append the eos token. The origin is that Qwen
+            chat templates always append <eos>\n, while some models like Gemma do not
+            use the <eos> at all in the chat template. Adding an <eos> when one is
+            already at the end is likely a user error, and since we know Qwen likes to
+            have <eos>\n, we'll check for that case.
+
+            This makes the logic slightly more robust to the model family's chat template,
+            so users don't need to know whether they need to set add_eos or not.
+            """
+            stripped_message_chunk = message_chunk.rstrip("\n")
             if add_eos_token:
                 if tokenizer.eos_token is None:
                     warnings.warn(
                         "add_eos_token is True but the tokenizer does not have an EOS token. Skipping EOS token addition."
                     )
-                elif not message_chunk.endswith(tokenizer.eos_token):
+                elif not stripped_message_chunk.endswith(tokenizer.eos_token):
                     message_chunk += tokenizer.eos_token
 
         # get images too (extend this for other modalities)
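The change above checks for the EOS on a newline-stripped copy of the final chunk but still appends to the original, un-stripped chunk. A minimal standalone sketch of that behavior, assuming a hypothetical helper name (append_eos_if_missing) and illustrative template strings rather than real tokenizer output:

from typing import Optional


def append_eos_if_missing(message_chunk: str, eos_token: Optional[str]) -> str:
    # Mirror the logic in the diff: test for EOS on a newline-stripped copy,
    # but append to the original chunk so any trailing "\n" from the template survives.
    if eos_token is None:
        return message_chunk  # the real code warns and skips in this case
    stripped = message_chunk.rstrip("\n")
    if not stripped.endswith(eos_token):
        message_chunk += eos_token
    return message_chunk


# Qwen-style output already ends in "<|im_end|>\n", so no second EOS is appended.
assert append_eos_if_missing("answer<|im_end|>\n", "<|im_end|>") == "answer<|im_end|>\n"
# Gemma-style output carries no EOS at all, so one is appended exactly once.
assert append_eos_if_missing("answer\n", "<eos>") == "answer\n<eos>"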

tests/unit/data/test_llm_message_utils.py

Lines changed: 114 additions & 165 deletions
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 
+from typing import Any, Callable
+
 import pytest
 import torch
 from PIL import Image
@@ -329,177 +331,124 @@ def test_batch_pad_message_log_custom_pad_value(
     )
 
 
-@pytest.mark.hf_gated
-def test_get_formatted_message_log_llama(
-    raw_chat_message_log: LLMMessageLogType,
-) -> None:
-    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
-
-    ## get expected result
-    formatted_system_message = tokenizer.apply_chat_template(
-        [raw_chat_message_log[0]],
-        tokenize=False,
-        add_generation_prompt=False,
-        add_special_tokens=False,
-    )
-    formatted_user_message = tokenizer.apply_chat_template(
-        [raw_chat_message_log[1]],
-        tokenize=False,
-        add_generation_prompt=False,
-        add_special_tokens=False,
-    )
-    formatted_assistant_message = tokenizer.apply_chat_template(
-        [raw_chat_message_log[2]],
-        tokenize=False,
-        add_generation_prompt=False,
-        add_special_tokens=False,
-    )
-
-    ## text should be equivalent to if we apply chat template
-    ## to each turn separately and manually remove the bot string
-    ## from the intermediate turns
-    bot_str = "<|begin_of_text|>"
-    expected_text = [
-        formatted_system_message,
-        formatted_user_message[len(bot_str) :],
-        formatted_assistant_message[len(bot_str) :],
-    ]
-
-    task_data_spec = TaskDataSpec(
-        task_name="test",
-    )
-    result = get_formatted_message_log(raw_chat_message_log, tokenizer, task_data_spec)
-    actual_text = [m["content"] for m in result]
-
-    assert actual_text == expected_text
-
-
-@pytest.mark.hf_gated
-def test_get_formatted_message_log_add_generation_prompt_llama(
+@pytest.mark.parametrize(
+    "model_id, chat_log_transform",
+    [
+        pytest.param(
+            "meta-llama/Meta-Llama-3-8B-Instruct",
+            lambda raw: raw,
+            marks=pytest.mark.hf_gated,
+            id="llama",
+        ),
+        pytest.param(
+            "google/gemma-3-27b-it",
+            # Some Gemma chat templates (or versions) raise on system turns.
+            # For portability across environments, test on user+assistant only.
+            # If your tokenizer supports system turns, you can change this to `lambda raw: raw`.
+            lambda raw: [raw[1], raw[2]],
+            marks=pytest.mark.hf_gated,
+            id="gemma",
+        ),
+        pytest.param(
+            "Qwen/Qwen2.5-Coder-32B-Instruct",
+            lambda raw: raw,
+            id="qwen",
+        ),
+    ],
+)
+@pytest.mark.parametrize("add_generation_prompt", [False, True])
+def test_get_formatted_message_log_models(
     raw_chat_message_log: LLMMessageLogType,
+    model_id: str,
+    chat_log_transform: Callable[[Any], Any],
+    add_generation_prompt: bool,
 ) -> None:
-    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
-
-    ## get expected result
-    formatted_system_message = tokenizer.apply_chat_template(
-        [raw_chat_message_log[0]],
-        tokenize=False,
-        add_generation_prompt=False,
-        add_special_tokens=False,
-    )
-    formatted_user_message = tokenizer.apply_chat_template(
-        [raw_chat_message_log[1]],
-        tokenize=False,
-        add_generation_prompt=True,
-        add_special_tokens=False,
-    )
-    formatted_assistant_message = (
-        raw_chat_message_log[2]["content"] + tokenizer.eos_token
-    )
-
-    ## text should be equivalent to if we apply chat template
-    ## to each turn separately and manually remove the bot string
-    ## from the intermediate turns
-    bot_str = "<|begin_of_text|>"
-    expected_text = [
-        formatted_system_message,
-        formatted_user_message[len(bot_str) :],
-        formatted_assistant_message,
-    ]
-
-    task_data_spec = TaskDataSpec(
-        task_name="test",
-    )
+    """Validate that get_formatted_message_log produces text consistent with the
+    tokenizer's chat template across models.
+
+    This test is parametrized over model/tokenizer and whether to include a
+    generation prompt. For models like Gemma that error on system turns, the
+    input chat log is transformed to exclude the system message.
+
+    Expectations:
+    - Require an EOS token for well-defined end-of-turn comparison.
+    - When add_generation_prompt is False, the concatenated contents must match
+      the tokenizer's apply_chat_template output; if the tokenizer omits a final
+      EOS, accept the actual with EOS by appending EOS to the expected before
+      comparison.
+    - When add_generation_prompt is True and the last turn is an assistant
+      message, accept either:
+      (1) prefix built with add_generation_prompt=True followed by the raw
+          assistant content plus EOS; or
+      (2) the tokenizer's full non-generation template output plus EOS.
+      This avoids hard-coding model-specific headers or delimiters while still
+      verifying semantic equivalence.
+    - Only normalization performed is trimming a trailing newline after EOS.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    chat_log = chat_log_transform(raw_chat_message_log)
+    # Ensure tokenizer defines an EOS token; otherwise the test logic is ill-defined
+    assert tokenizer.eos_token, "Tokenizer must define eos_token for this test"
+    eos = tokenizer.eos_token
+    task_data_spec = TaskDataSpec(task_name="test")
     result = get_formatted_message_log(
-        raw_chat_message_log,
+        chat_log,
         tokenizer,
         task_data_spec,
-        add_generation_prompt=True,
-    )
-    actual_text = [m["content"] for m in result]
-
-    assert actual_text == expected_text
-
-
-def test_get_formatted_message_log_qwen(
-    raw_chat_message_log: LLMMessageLogType,
-) -> None:
-    ## test using a tokenizer that does not have a bos token
-    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct")
-    assert tokenizer.bos_token is None
-
-    ## get expected result
-    ## result is equivalent to if we apply chat template to the full message log,
-    ## remove the trailing newline, and then partition by the delimiter
-    expected_text_string = tokenizer.apply_chat_template(
-        [raw_chat_message_log],
-        tokenize=False,
-        add_generation_prompt=False,
-        add_special_tokens=False,
-    )[0].rstrip("\n")  ## remove trailing newline
-
-    delimiter = "<|im_end|>\n"
-    split_text = expected_text_string.split(delimiter)
-    expected_text = []
-    for i in range(len(split_text)):
-        if i == len(raw_chat_message_log) - 1:
-            expected_text.append(split_text[i])
+        add_generation_prompt=add_generation_prompt,
+    )
+    actual_concat = "".join(m["content"] for m in result)
+
+    def normalize(s: str) -> str:
+        # Normalize EOS+newline quirk to EOS only
+        if s.endswith(eos + "\n"):
+            return s[:-1]
+        return s
+
+    if not add_generation_prompt:
+        expected_concat = tokenizer.apply_chat_template(
+            [chat_log],
+            tokenize=False,
+            add_generation_prompt=False,
+            add_special_tokens=False,
+        )[0]
+        # Accept EOS presence even if the tokenizer's template omits it
+        if actual_concat.endswith(eos) and not expected_concat.endswith(eos):
+            expected_concat = expected_concat + eos
+        assert normalize(actual_concat) == normalize(expected_concat)
+    else:
+        if len(chat_log) > 0 and chat_log[-1].get("role") == "assistant":
+            prefix_log = chat_log[:-1]
+            # Some tokenizers include a role header when add_generation_prompt=True.
+            # Accept either behavior without hard-coding model-specific strings.
+            prefix_gen = tokenizer.apply_chat_template(
+                [prefix_log],
+                tokenize=False,
+                add_generation_prompt=True,
+                add_special_tokens=False,
+            )[0]
+            assistant_suffix = chat_log[-1]["content"] + eos
+            expected_concat_a = prefix_gen + assistant_suffix
+            # Alternative: take the full non-generation template output and just append EOS
+            full_no_gen = tokenizer.apply_chat_template(
+                [chat_log],
+                tokenize=False,
+                add_generation_prompt=False,
+                add_special_tokens=False,
+            )[0]
+            expected_concat_b = full_no_gen + eos
+            actual_norm = normalize(actual_concat)
+            assert actual_norm == normalize(
+                expected_concat_a
+            ) or actual_norm == normalize(expected_concat_b)
         else:
-            expected_text.append(split_text[i] + delimiter)
-
-    task_data_spec = TaskDataSpec(
-        task_name="test",
-    )
-    result = get_formatted_message_log(raw_chat_message_log, tokenizer, task_data_spec)
-    actual_text = [m["content"] for m in result]
-
-    assert actual_text == expected_text
-
-
-def test_get_formatted_message_log_add_generation_prompt_qwen(
-    raw_chat_message_log: LLMMessageLogType,
-) -> None:
-    ## test using a tokenizer that does not have a bos token
-    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct")
-    assert tokenizer.bos_token is None
-
-    ## get expected result
-    ## result is equivalent to if we apply chat template to the full message log,
-    ## remove the trailing newline, and then partition by the delimiter
-    ## Separately handle the last message because of the generation prompt
-    expected_text_string = tokenizer.apply_chat_template(
-        [raw_chat_message_log[:2]],
-        tokenize=False,
-        add_generation_prompt=True,
-        add_special_tokens=False,
-    )[0]
-
-    delimiter = "<|im_end|>\n"
-    split_text = expected_text_string.split(delimiter, 1)
-    expected_text = []
-    for i in range(len(split_text)):
-        if i == len(split_text) - 1:
-            expected_text.append(split_text[i])
-        else:
-            expected_text.append(split_text[i] + delimiter)
-
-    formatted_assistant_message = (
-        raw_chat_message_log[2]["content"] + tokenizer.eos_token
-    )
-    expected_text.append(formatted_assistant_message)
-
-    task_data_spec = TaskDataSpec(
-        task_name="test",
-    )
-    result = get_formatted_message_log(
-        raw_chat_message_log,
-        tokenizer,
-        task_data_spec,
-        add_generation_prompt=True,
-    )
-    actual_text = [m["content"] for m in result]
-
-    assert actual_text == expected_text
+            expected_concat = tokenizer.apply_chat_template(
+                [chat_log],
+                tokenize=False,
+                add_generation_prompt=True,
+                add_special_tokens=False,
+            )[0]
+            assert normalize(actual_concat) == normalize(expected_concat)
 
 
 @pytest.mark.hf_gated
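The parametrized test compares the concatenated per-turn contents against the tokenizer's own apply_chat_template output, tolerating only the EOS-plus-newline quirk. A small sketch of that normalization idea, using an illustrative placeholder EOS string rather than a specific tokenizer's token:

EOS = "<|im_end|>"  # illustrative placeholder for tokenizer.eos_token


def normalize(s: str) -> str:
    # Drop only the single newline that some chat templates emit after the EOS.
    return s[:-1] if s.endswith(EOS + "\n") else s


# Qwen-style "EOS + newline" and a bare EOS normalize to the same string.
assert normalize("assistant reply" + EOS + "\n") == "assistant reply" + EOS
assert normalize("assistant reply" + EOS) == "assistant reply" + EOS
# Strings that do not end in EOS are left untouched, so genuine mismatches still fail.
assert normalize("assistant reply\n") == "assistant reply\n"

Because only the llama and gemma cases are marked hf_gated, the qwen case can be exercised without gated-model access, for example with pytest tests/unit/data/test_llm_message_utils.py -k qwen.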
