axolotl-ai-cloud · NanoCode012 · Jan 23, 2024 · Jan 10, 2024 · Jan 10, 2024 · Jan 10, 2024
diff --git a/tests/prompt_strategies/test_alpacha.py b/tests/prompt_strategies/test_alpacha.py
@@ -0,0 +1,344 @@
+"""
+Test module for alpacha integration w chatml
+"""
+import pytest
+from datasets import Dataset
+from tokenizers import AddedToken
+from transformers import AutoTokenizer
+
+from axolotl.datasets import TokenizedPromptDataset
+from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
+from axolotl.prompters import AlpacaPrompter
+
+
+@pytest.fixture(name="alpacha_dataset")
+def fixture_alpacha_dataset():
+    return Dataset.from_list(
+        [
+            {
+                "instruction": "Evaluate this sentence for spelling and grammar mistakes",
+                "input": "He finnished his meal and left the resturant",
+                "output": "He finished his meal and left the restaurant.",
+            }
+        ]
+    )
+
+
+@pytest.fixture(name="tokenizer")
+def fixture_tokenizer():
+    # pylint: disable=all
+    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+    tokenizer.add_special_tokens(
+        {
+            "eos_token": AddedToken(
+                "<|im_end|>", rstrip=False, lstrip=False, normalized=False
+            )
+        }
+    )
+    tokenizer.add_tokens(
+        [
+            AddedToken("<|im_start|>", rstrip=False, lstrip=False, normalized=False),
+        ]
+    )
+
+    return tokenizer
+
+
+class TestAlpacha:
+    """
+    Test class for alpacha prompter
+    """
+
+    def test_no_double_im_end(self, alpacha_dataset, tokenizer):
+        strategy = AlpacaPromptTokenizingStrategy(
+            AlpacaPrompter(prompt_style="chatml"),
+            tokenizer,
+            False,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, alpacha_dataset, process_count=1
+        )
+
+        input_ids = dataset_wrapper[0]["input_ids"]
+
+        assert input_ids == [
+            1,  # Bos
+            32001,
+            1587,
+            13,
+            20548,
+            336,
+            349,
+            396,
+            13126,
+            369,
+            13966,
+            264,
+            3638,
+            28725,
+            5881,
+            1360,
+            395,
+            396,
+            2787,
+            369,
+            5312,
+            3629,
+            2758,
+            28723,
+            12018,
+            264,
+            2899,
+            369,
+            6582,
+            1999,
+            2691,
+            274,
+            272,
+            2159,
+            28723,
+            32000,
+            28705,
+            13,  # instruction
+            32001,
+            2188,
+            13,
+            16627,
+            11931,
+            456,
+            12271,
+            354,
+            668,
+            3572,
+            304,
+            18756,
+            3479,
+            17179,
+            13,
+            2428,
+            854,
+            28711,
+            1497,
+            516,
+            11314,
+            304,
+            1749,
+            272,
+            1846,
+            324,
+            440,
+            32000,
+            28705,
+            13,  # input
+            32001,
+            13892,
+            13,
+            650,
+            5967,
+            516,
+            11314,
+            304,
+            1749,
+            272,
+            9926,
+            28723,
+            32000,  # output
+        ]
+
+    def test_no_train_on_input(self, alpacha_dataset, tokenizer):
+        strategy = AlpacaPromptTokenizingStrategy(
+            AlpacaPrompter(prompt_style="chatml"),
+            tokenizer,
+            False,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, alpacha_dataset, process_count=1
+        )
+
+        labels = dataset_wrapper[0]["labels"]
+
+        assert labels == [
+            -100,  # bos
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,  # instruction
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,
+            -100,  # input
+            -100,
+            -100,
+            -100,
+            650,
+            5967,
+            516,
+            11314,
+            304,
+            1749,
+            272,
+            9926,
+            28723,
+            32000,  # Output
+        ]
+
+    def test_w_train_on_input(self, alpacha_dataset, tokenizer):
+        strategy = AlpacaPromptTokenizingStrategy(
+            AlpacaPrompter(prompt_style="chatml"),
+            tokenizer,
+            True,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, alpacha_dataset, process_count=1
+        )
+
+        labels = dataset_wrapper[0]["labels"]
+
+        assert labels == [
+            1,  # Bos
+            32001,
+            1587,
+            13,
+            20548,
+            336,
+            349,
+            396,
+            13126,
+            369,
+            13966,
+            264,
+            3638,
+            28725,
+            5881,
+            1360,
+            395,
+            396,
+            2787,
+            369,
+            5312,
+            3629,
+            2758,
+            28723,
+            12018,
+            264,
+            2899,
+            369,
+            6582,
+            1999,
+            2691,
+            274,
+            272,
+            2159,
+            28723,
+            32000,
+            28705,
+            13,  # instruction
+            32001,
+            2188,
+            13,
+            16627,
+            11931,
+            456,
+            12271,
+            354,
+            668,
+            3572,
+            304,
+            18756,
+            3479,
+            17179,
+            13,
+            2428,
+            854,
+            28711,
+            1497,
+            516,
+            11314,
+            304,
+            1749,
+            272,
+            1846,
+            324,
+            440,
+            32000,
+            28705,
+            13,  # input
+            32001,
+            13892,
+            13,
+            650,
+            5967,
+            516,
+            11314,
+            304,
+            1749,
+            272,
+            9926,
+            28723,
+            32000,  # output
+        ]