better handling and logging of empty sharegpt turns (#603)

axolotl-ai-cloud · Sep 22, 2023 · commit a363604 · 1 parent 501958b
Show file tree

Hide file tree

Showing 3 changed files with 105 additions and 14 deletions.
diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py
@@ -358,10 +358,12 @@ def tokenize_prompt(self, prompt):
             ):
                 if isinstance(part, tuple):
                     if part[0] == "USER:":
-                        part = part[0] + part[1] if not user_token else part[1]
+                        turn = part[0] + part[1] if not user_token else part[1]
                         # this is still the user query, we should
+                        if not part[1].strip():
+                            LOG.warning(f"user turn has empty text: {prompt}")
                         res = self._tokenize(
-                            part.strip(),
+                            turn.strip(),
                             add_eos_token=False,
                             strip_bos_token=True,
                         )
@@ -371,10 +373,12 @@ def tokenize_prompt(self, prompt):
                         labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
                     elif part[0] == "ASSISTANT:":
                         # TODO label assistant token/tokens w/ IGNORE_TOKEN_ID
-                        part = part[0] + part[1] if not assistant_token else part[1]
-                        # this should be the assistent response, should end with an eos token
+                        turn = part[0] + part[1] if not assistant_token else part[1]
+                        # this should be the assistant response, should end with an eos token
+                        if not part[1].strip():
+                            LOG.warning(f"assistant turn has empty text: {prompt}")
                         res = self._tokenize(
-                            part.strip(),
+                            turn.strip(),
                             add_eos_token=True,
                             strip_bos_token=True,
                         )
@@ -409,22 +413,31 @@ def tokenize_prompt(self, prompt):
             raise InvalidDataException(str(err)) from err
 
     def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False):
-        result = self.tokenizer(
-            prompt,
-            truncation=True,
-            max_length=self.sequence_len,
-            padding=False,
-            return_tensors=None,
-        )
+        if not prompt.strip():
+            LOG.warning("Empty text requested for tokenization.")
+            result = BatchEncoding(data={"input_ids": [], "attention_mask": []})
+        else:
+            result = self.tokenizer(
+                prompt,
+                truncation=True,
+                max_length=self.sequence_len,
+                padding=False,
+                return_tensors=None,
+            )
         if (
-            result["input_ids"][-1] != self.tokenizer.eos_token_id
+            len(result["input_ids"]) > 0
+            and result["input_ids"][-1] != self.tokenizer.eos_token_id
             and len(result["input_ids"]) < self.sequence_len
             and add_eos_token
         ):
             result["input_ids"].append(self.tokenizer.eos_token_id)
             result["attention_mask"].append(1)
 
-        if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
+        if (
+            len(result["input_ids"]) > 0
+            and result["input_ids"][0] == self.tokenizer.bos_token_id
+            and strip_bos_token
+        ):
             result["input_ids"] = result["input_ids"][1:]
             result["attention_mask"] = result["attention_mask"][1:]
 

diff --git a/tests/fixtures/conversation.missingturns.json b/tests/fixtures/conversation.missingturns.json
@@ -0,0 +1,52 @@
+{
+  "conversations": [
+    {
+      "from": "human",
+      "value": "how could i implement a minesweeper algorithm that utilises algebraic topology to solve boards?"
+    },
+    {
+      "from": "gpt",
+      "value": ""
+    },
+    {
+      "from": "human",
+      "value": "could you give me the gist of how it could though?"
+    },
+    {
+      "from": "gpt",
+      "value": "Sure. The basic idea would be to use algebraic topology to analyze the structure of the Minesweeper game board and determine the locations of mines based on certain topological invariants. This could involve representing the game board as a topological space and using topological invariants such as homotopy groups and Betti numbers to infer information about the location of mines on the board. However, as I mentioned before, implementing this would require a deep understanding of both algebraic topology and the game of Minesweeper, so it's not something that can be explained in a simple answer."
+    },
+    {
+      "from": "human",
+      "value": "give me a specific example"
+    },
+    {
+      "from": "gpt",
+      "value": "As a specific example, suppose we have a Minesweeper game board with the following configuration:\n```\n 1  1  1  1  1\n 1  ?  ?  ?  1\n 1  ?  ?  ?  1\n 1  ?  ?  ?  1\n 1  1  1  1  1\n```\nIn this case, the game board can be represented as a topological space, where each square on the board corresponds to a point in the space. The mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space. Using algebraic topology, we can analyze the structure of this space and determine the locations of the mines based on certain topological invariants.\n\nFor instance, one way to do this is to compute the homotopy groups of the space. Homotopy groups are a way of describing the holes in a space, and they can be used to infer information about the locations of the mines on the Minesweeper board. In the case of the board above, we can compute the first homotopy group of the space, which describes the number and \"shape\" of the holes in the space. In this case, the first homotopy group would tell us that there are four holes in the space, arranged in a square shape.\n\nBased on this information, we can infer that the mines in the game are located in the four corners of the board, as shown below:\n```\n 1  1  1  1  1\n 1  M  ?  ?  1\n 1  ?  ?  ?  1\n 1  ?  ?  M  1\n 1  1  1  1  1\n```\nOf course, this is just one example of how algebraic topology could be used to solve Minesweeper. There are many other possible approaches, and the specific details of how to implement them would depend on the specific game board and the topological invariants being used. As I mentioned before, implementing a solution like this would require a deep understanding of both algebraic topology and the game of Minesweeper."
+    },
+    {
+      "from": "human",
+      "value": "assuming i was someone with a deep understanding of both, could you explain the same concept to me with that assumption?"
+    },
+    {
+      "from": "gpt",
+      "value": "Sure. Assuming you have a deep understanding of both algebraic topology and the game of Minesweeper, here is how you might use algebraic topology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Use topological invariants to infer information about the locations of the mines on the board. There are many possible topological invariants that could be used for this, such as homotopy groups, Betti numbers, or persistent homology.\n3. Use the information inferred from the topological invariants to determine the locations of the mines on the board. For example, if the first homotopy group of the space indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using algebraic topology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the topological invariants being used. However, this should give you a general idea of how algebraic topology can be used to solve Minesweeper."
+    },
+    {
+      "from": "human",
+      "value": "how would i do it with persistent homology or any of the other topological invariants?"
+    },
+    {
+      "from": "gpt",
+      "value": "To use persistent homology or any of the other topological invariants to solve a Minesweeper game board, you would follow the same basic steps as described above, but with some differences in the specific details.\n\nFor instance, here is how you might use persistent homology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Compute the persistent homology of the space. Persistent homology is a way of describing the holes in a space over time, and it can be used to infer information about the locations of the mines on the Minesweeper board.\n3. Use the information from the persistent homology to determine the locations of the mines on the board. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using persistent homology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the specific details of the persistent homology computation. However, this should give you a general idea of how persistent homology can be used to solve Minesweeper."
+    },
+    {
+      "from": "human",
+      "value": "how could i implement this in python?"
+    },
+    {
+      "from": "gpt",
+      "value": "To implement a Minesweeper solver that uses algebraic topology in Python, you would need to do the following:\n\n1. Install a Python package for algebraic topology. There are several options available, such as Gudhi, Dionysus, or scikit-tda. These packages provide algorithms and data structures for computing topological invariants, such as homotopy groups, Betti numbers, or persistent homology.\n2. Represent the Minesweeper game board as a topological space. This could be done using a data structure provided by the algebraic topology package, such as a simplicial complex or a Vietoris-Rips complex. Each square on the board would correspond to a point in the space, and the mines on the board would be represented as \"holes\" in the space.\n3. Compute the topological invariants of the space. This could be done using algorithms provided by the algebraic topology package. For example, you could compute the persistent homology of the space using the persistent homology algorithm.\n4. Use the information from the topological invariants to determine the locations of the mines on the board. This could be done using a combination of mathematical reasoning and programming. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then you could use a piece of code to infer that the mines are located in the four corners of the board.\n5. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you could use a piece of code to automatically click on the safe squares and clear the board.\n\nOf course, this is just one possible approach to implementing a Minesweeper solver that uses algebraic topology in Python. There may be other ways to do it, depending on the specific details of the implementation. However, this should give you a general idea of how to get started with such a project."
+    }
+  ]
+}
diff --git a/tests/test_prompt_tokenizers.py b/tests/test_prompt_tokenizers.py
@@ -3,7 +3,9 @@
 import logging
 import unittest
 from pathlib import Path
+from typing import Optional
 
+import pytest
 from transformers import AutoTokenizer, LlamaTokenizer
 
 from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
@@ -29,6 +31,12 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
     Test class for prompt tokenization strategies.
     """
 
+    _caplog: Optional[pytest.LogCaptureFixture] = None
+
+    @pytest.fixture(autouse=True)  # autouse: applied without tests requesting it
+    def inject_fixtures(self, caplog):
+        self._caplog = caplog  # expose pytest's caplog to the unittest-style tests
+
     def setUp(self) -> None:
         # pylint: disable=duplicate-code
         self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
@@ -64,6 +72,24 @@ def test_sharegpt_integration(self):
             self.assertEqual(len(example[fields]), len(tokenized_conversation[fields]))
             self.assertEqual(example[fields], tokenized_conversation[fields])
 
+    def test_sharegpt_warnings_integration(self):
+        """An empty assistant turn in a sharegpt conversation logs a warning."""
+        # fixture whose first gpt turn has an empty "value"
+        fixture = Path(__file__).parent / "fixtures/conversation.missingturns.json"
+        conversation = json.loads(fixture.read_text(encoding="utf-8"))
+        prompter = ShareGPTPrompter("chat")
+        strat = ShareGPTPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        with self._caplog.at_level(logging.WARNING):
+            strat.tokenize_prompt(conversation)
+            # search every record: the warning's position depends on what else logs
+            messages = [record.getMessage() for record in self._caplog.records]
+            assert any("assistant turn has empty text" in msg for msg in messages)
+
     def test_no_sys_prompt(self):
         """
         tests the interface between the user and assistant parts