Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support list/tuple inputs for special tokens in MultiSegmentPacker layer #1046

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 65 additions & 17 deletions keras_nlp/layers/multi_segment_packer.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,16 @@ class MultiSegmentPacker(keras.layers.Layer):

Args:
sequence_length: The desired output length.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's add the type notes you have in the other layer for consistency

start_value: The id or token that is to be placed at the start of each
sequence (called "[CLS]" for BERT). The dtype must match the dtype
of the input tensors to the layer.
end_value: The id or token that is to be placed at the end of each
input segment (called "[SEP]" for BERT). The dtype much match the
start_value: The id(s) or token(s) that are to be placed at the start of
each sequence (called "[CLS]" for BERT). The dtype must match the
dtype of the input tensors to the layer.
end_value: The id(s) or token(s) that are to be placed at the end of
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is/are -> are

the last input segment (called "[SEP]" for BERT). The dtype must
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

much -> must

match the dtype of the input tensors to the layer.
sep_value: The id(s) or token(s) that are to be placed at the end of
every segment, except the last segment (called "[SEP]" for BERT).
If `None`, `end_value` is used. The dtype must match the dtype of
the input tensors to the layer.
pad_value: The id or token that is to be placed into the unused
positions after the last segment in the sequence
(called "[PAD]" for BERT).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we add an example below? maybe roberta double sep?

Expand Down Expand Up @@ -110,6 +114,7 @@ def __init__(
sequence_length,
start_value,
end_value,
sep_value=None,
pad_value=None,
truncate="round_robin",
**kwargs,
Expand All @@ -124,17 +129,37 @@ def __init__(
"supported. Received %s" % truncate
)
self.truncate = truncate

# Maintain private copies of start/end values for config purposes.
self._start_value = start_value
self._sep_value = sep_value
self._end_value = end_value

if not isinstance(start_value, (list, tuple)):
start_value = [start_value]

if sep_value is None:
sep_value = end_value
if not isinstance(sep_value, (list, tuple)):
sep_value = [sep_value]

if not isinstance(end_value, (list, tuple)):
end_value = [end_value]

self.start_value = start_value
self.sep_value = sep_value
self.end_value = end_value

self.pad_value = pad_value

def get_config(self):
config = super().get_config()
config.update(
{
"sequence_length": self.sequence_length,
"start_value": self.start_value,
"end_value": self.end_value,
"start_value": self._start_value,
"end_value": self._end_value,
"sep_value": self._sep_value,
"pad_value": self.pad_value,
"truncate": self.truncate,
}
Expand Down Expand Up @@ -170,7 +195,12 @@ def _convert_dense(self, x):

def _trim_inputs(self, inputs):
"""Trim inputs to desired length."""
num_special_tokens = len(inputs) + 1
num_segments = len(inputs)
num_special_tokens = (
len(self.start_value)
+ (num_segments - 1) * len(self.sep_value)
+ len(self.end_value)
)
if self.truncate == "round_robin":
return tf_text.RoundRobinTrimmer(
self.sequence_length - num_special_tokens
Expand All @@ -187,22 +217,40 @@ def _combine_inputs(self, segments):
dtype = segments[0].dtype
batch_size = segments[0].nrows()
start_value = tf.convert_to_tensor(self.start_value, dtype=dtype)
sep_value = tf.convert_to_tensor(self.sep_value, dtype=dtype)
end_value = tf.convert_to_tensor(self.end_value, dtype=dtype)

start_column = tf.fill((batch_size, 1), start_value)
end_column = tf.fill((batch_size, 1), end_value)
ones_column = tf.ones_like(start_column, dtype=tf.int32)
start_values_tensor = tf.repeat(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these names are a little confusing start_value is already a tensor. should we co back to _column naming?

start_column, end_column, sep_column?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can, but it isn't exactly a column :P. I'll call it start_columns

start_value[tf.newaxis, :], repeats=batch_size, axis=0
)
end_values_tensor = tf.repeat(
end_value[tf.newaxis, :], repeats=batch_size, axis=0
)
sep_values_tensor = tf.repeat(
sep_value[tf.newaxis, :], repeats=batch_size, axis=0
)
ones_sep_tensor = tf.ones_like(sep_values_tensor, dtype=tf.int32)
ones_end_tensor = tf.ones_like(end_values_tensor, dtype=tf.int32)

segments_to_combine = [start_values_tensor]
segment_ids_to_combine = [
tf.ones_like(start_values_tensor, dtype=tf.int32) * 0
]

segments_to_combine = [start_column]
segment_ids_to_combine = [ones_column * 0]
for i, seg in enumerate(segments):
# Combine all segments adding end tokens.
# Combine all segments.
segments_to_combine.append(seg)
segments_to_combine.append(end_column)

# Combine segment ids accounting for end tokens.
# Combine segment ids.
segment_ids_to_combine.append(tf.ones_like(seg, dtype=tf.int32) * i)
segment_ids_to_combine.append(ones_column * i)

# Account for the sep/end tokens here.
if i == len(segments) - 1:
segments_to_combine.append(end_values_tensor)
segment_ids_to_combine.append(ones_end_tensor * i)
else:
segments_to_combine.append(sep_values_tensor)
segment_ids_to_combine.append(ones_sep_tensor * i)

token_ids = tf.concat(segments_to_combine, 1)
segment_ids = tf.concat(segment_ids_to_combine, 1)
Expand Down
48 changes: 47 additions & 1 deletion keras_nlp/layers/multi_segment_packer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Transformer Decoder."""
"""Tests for multi-segment packing."""

import os

Expand Down Expand Up @@ -147,6 +147,52 @@ def test_pad_batched_inputs(self):
),
)

def test_list_special_tokens(self):
seq1 = tf.ragged.constant([["a", "b", "c"], ["a", "b", "c"]])
seq2 = tf.ragged.constant([["x", "y", "z"], ["x"]])
packer = MultiSegmentPacker(
9,
start_value="[CLS]",
end_value="[SEP]",
sep_value=["[SEP]", "[SEP]"],
pad_value="[PAD]",
truncate="round_robin",
)
output = packer([seq1, seq2])
self.assertAllEqual(
output,
(
[
[
"[CLS]",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

try to come up with a slightly shorter test case that will format the lists to one line

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoops!

"a",
"b",
"c",
"[SEP]",
"[SEP]",
"x",
"y",
"[SEP]",
],
[
"[CLS]",
"a",
"b",
"c",
"[SEP]",
"[SEP]",
"x",
"[SEP]",
"[PAD]",
],
],
[
[0, 0, 0, 0, 0, 0, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 1, 1, 0],
],
),
)

def test_config(self):
seq1 = tf.ragged.constant([["a", "b", "c"], ["a", "b"]])
seq2 = tf.ragged.constant([["x", "y", "z"], ["x", "y", "z"]])
Expand Down
146 changes: 0 additions & 146 deletions keras_nlp/models/roberta/roberta_multi_segment_packer.py

This file was deleted.

9 changes: 4 additions & 5 deletions keras_nlp/models/roberta/roberta_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,8 @@
import copy

from keras_nlp.api_export import keras_nlp_export
from keras_nlp.layers.multi_segment_packer import MultiSegmentPacker
from keras_nlp.models.preprocessor import Preprocessor
from keras_nlp.models.roberta.roberta_multi_segment_packer import (
RobertaMultiSegmentPacker,
)
from keras_nlp.models.roberta.roberta_presets import backbone_presets
from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer
from keras_nlp.utils.keras_utils import (
Expand Down Expand Up @@ -145,9 +143,10 @@ def __init__(
super().__init__(**kwargs)

self.tokenizer = tokenizer
self.packer = RobertaMultiSegmentPacker(
self.packer = MultiSegmentPacker(
start_value=self.tokenizer.start_token_id,
end_value=self.tokenizer.end_token_id,
sep_value=[self.tokenizer.end_token_id] * 2,
pad_value=self.tokenizer.pad_token_id,
truncate=truncate,
sequence_length=sequence_length,
Expand All @@ -166,7 +165,7 @@ def get_config(self):
def call(self, x, y=None, sample_weight=None):
x = convert_inputs_to_list_of_tensor_segments(x)
x = [self.tokenizer(segment) for segment in x]
token_ids = self.packer(x)
token_ids, _ = self.packer(x)
x = {
"token_ids": token_ids,
"padding_mask": token_ids != self.tokenizer.pad_token_id,
Expand Down
Loading