keras-team · mattdangerw · Feb 17, 2023 · Feb 5, 2023 · Feb 8, 2023 · Feb 8, 2023
diff --git a/keras_nlp/models/__init__.py b/keras_nlp/models/__init__.py
@@ -13,6 +13,10 @@
 # limitations under the License.
 
 from keras_nlp.models.albert.albert_backbone import AlbertBackbone
+from keras_nlp.models.albert.albert_masked_lm import AlbertMaskedLM
+from keras_nlp.models.albert.albert_masked_lm_preprocessor import (
+    AlbertMaskedLMPreprocessor,
+)
 from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor
 from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
 from keras_nlp.models.bart.bart_backbone import BartBackbone

diff --git a/keras_nlp/models/albert/albert_masked_lm.py b/keras_nlp/models/albert/albert_masked_lm.py
@@ -0,0 +1,70 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Albert masked lm model."""
+
+import copy
+
+from tensorflow import keras
+
+from keras_nlp.layers.masked_lm_head import MaskedLMHead
+from keras_nlp.models.albert.albert_backbone import AlbertBackbone
+from keras_nlp.models.albert.albert_backbone import albert_kernel_initializer
+from keras_nlp.models.albert.albert_masked_lm_preprocessor import (
+    AlbertMaskedLMPreprocessor,
+)
+from keras_nlp.models.albert.albert_presets import backbone_presets
+from keras_nlp.models.task import Task
+from keras_nlp.utils.python_utils import classproperty
+
+
+@keras.utils.register_keras_serializable(package="keras_nlp")
+class AlbertMaskedLM(Task):
+    def __init__(self, backbone, preprocessor=None, **kwargs):
+        inputs = {
+            **backbone.input,
+            "mask_positions": keras.Input(
+                shape=(None,), dtype="int32", name="mask_positions"
+            ),
+        }
+
+        backbone_outputs = backbone(backbone.input)
+        outputs = MaskedLMHead(
+            vocabulary_size=backbone.vocabulary_size,
+            embedding_weights=backbone.token_embedding.embeddings,
+            intermediate_activation="gelu",
+            kernel_initializer=albert_kernel_initializer(),
+            name="mlm_head",
+        )(backbone_outputs, inputs["mask_positions"])
+
+        super().__init__(
+            inputs=inputs,
+            outputs=outputs,
+            include_preprocessing=preprocessor is not None,
+            **kwargs
+        )
+
+        self.backbone = backbone
+        self.preprocessor = preprocessor
+
+    @classproperty
+    def backbone_cls(cls):
+        return AlbertBackbone
+
+    @classproperty
+    def preprocessor_cls(cls):
+        return AlbertMaskedLMPreprocessor
+
+    @classproperty
+    def presets(cls):
+        return copy.deepcopy(backbone_presets)
diff --git a/keras_nlp/models/albert/albert_masked_lm_preprocessor.py b/keras_nlp/models/albert/albert_masked_lm_preprocessor.py
@@ -0,0 +1,89 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Albert masked language model preprocessor layer"""
+
+from absl import logging
+from tensorflow import keras
+
+from keras_nlp.layers.masked_lm_mask_generator import MaskedLMMaskGenerator
+from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor
+from keras_nlp.utils.keras_utils import pack_x_y_sample_weight
+
+
+@keras.utils.register_keras_serializable(package="keras_nlp")
+class AlbertMaskedLMPreprocessor(AlbertPreprocessor):
+    def __init__(
+        self,
+        tokenizer,
+        sequence_length=512,
+        truncate="round_robin",
+        mask_selection_rate=0.15,
+        mask_selection_length=96,
+        mask_token_rate=0.8,
+        random_token_rate=0.1,
+        **kwargs,
+    ):
+        super().__init__(
+            tokenizer,
+            sequence_length=sequence_length,
+            truncate=truncate,
+            **kwargs,
+        )
+
+        self.masker = MaskedLMMaskGenerator(
+            mask_selection_rate=mask_selection_rate,
+            mask_selection_length=mask_selection_length,
+            mask_token_rate=mask_token_rate,
+            random_token_rate=random_token_rate,
+            vocabulary_size=tokenizer.vocabulary_size(),
+            mask_token_id=tokenizer.mask_token_id,
+            unselectable_token_ids=[
+                tokenizer.cls_token_id,
+                tokenizer.sep_token_id,
+                tokenizer.pad_token_id,
+            ],
+        )
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "mask_selection_rate": self.masker.mask_selection_rate,
+                "mask_selection_length": self.masker.mask_selection_length,
+                "mask_token_rate": self.masker.mask_token_rate,
+                "random_token_rate": self.masker.random_token_rate,
+            }
+        )
+        return config
+
+    def call(self, x, y=None, sample_weight=None):
+        if y is not None or sample_weight is not None:
+            logging.warning(
+                f"{self.__class__.__name__} generates `y` and `sample_weight` "
+                "based on your input data, but your data already contains `y` "
+                "or `sample_weight`. Your `y` and `sample_weight` will be "
+                "ignored."
+            )
+
+        x = super().call(x)
+        token_ids, padding_mask = x["token_ids"], x["padding_mask"]
+        masker_outputs = self.masker(token_ids)
+        x = {
+            "token_ids": masker_outputs["token_ids"],
+            "padding_mask": padding_mask,
+            "mask_positions": masker_outputs["mask_positions"],
+        }
+        y = masker_outputs["mask_ids"]
+        sample_weight = masker_outputs["mask_weights"]
+        return pack_x_y_sample_weight(x, y, sample_weight)
diff --git a/keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py b/keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py
@@ -0,0 +1,161 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for Albert masked language model preprocessor layer."""
+
+import io
+import os
+
+import sentencepiece
+import tensorflow as tf
+from absl.testing import parameterized
+from tensorflow import keras
+
+from keras_nlp.models.albert.albert_masked_lm_preprocessor import (
+    AlbertMaskedLMPreprocessor,
+)
+from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
+
+
+class AlbertaMaskedLMPreprocessorTest(tf.test.TestCase, parameterized.TestCase):
+    def setUp(self):
+        vocab_data = tf.data.Dataset.from_tensor_slices(
+            ["the quick brown fox", "the earth is round"]
+        )
+
+        bytes_io = io.BytesIO()
+        sentencepiece.SentencePieceTrainer.train(
+            sentence_iterator=vocab_data.as_numpy_iterator(),
+            model_writer=bytes_io,
+            vocab_size=10,
+            model_type="WORD",
+            pad_id=0,
+            unk_id=1,
+            bos_id=2,
+            eos_id=3,
+            pad_piece="<pad>",
+            unk_piece="<unk>",
+            bos_piece="[CLS]",
+            eos_piece="[SEP]",
+        )
+
+        proto = bytes_io.getvalue()
+
+        tokenizer = AlbertTokenizer(proto=proto)
+
+        self.preprocessor = AlbertMaskedLMPreprocessor(
+            tokenizer=tokenizer,
+            # Simplify out testing by masking every available token.
+            mask_selection_rate=1.0,
+            mask_token_rate=1.0,
+            random_token_rate=0.0,
+            mask_selection_length=5,
+            sequence_length=12,
+        )
+
+    def test_preprocess_strings(self):
+        input_data = " airplane at airport"
+
+        x, y, sw = self.preprocessor(input_data)
+        self.assertAllEqual(
+            x["token_ids"], [0, 12, 12, 12, 12, 12, 2, 1, 1, 1, 1, 1]
+        )
+        self.assertAllEqual(
+            x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
+        )
+        self.assertAllEqual(x["mask_positions"], [1, 2, 3, 4, 5])
+        self.assertAllEqual(y, [3, 4, 5, 3, 6])
+        self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0, 1.0])
+
+    def test_preprocess_list_of_strings(self):
+        input_data = [" airplane at airport"] * 4
+
+        x, y, sw = self.preprocessor(input_data)
+        self.assertAllEqual(
+            x["token_ids"], [[0, 12, 12, 12, 12, 12, 2, 1, 1, 1, 1, 1]] * 4
+        )
+        self.assertAllEqual(
+            x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]] * 4
+        )
+        self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4, 5]] * 4)
+        self.assertAllEqual(y, [[3, 4, 5, 3, 6]] * 4)
+        self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0, 1.0]] * 4)
+
+    def test_preprocess_dataset(self):
+        sentences = tf.constant([" airplane at airport"] * 4)
+        ds = tf.data.Dataset.from_tensor_slices(sentences)
+        ds = ds.map(self.preprocessor)
+        x, y, sw = ds.batch(4).take(1).get_single_element()
+        self.assertAllEqual(
+            x["token_ids"], [[0, 12, 12, 12, 12, 12, 2, 1, 1, 1, 1, 1]] * 4
+        )
+        self.assertAllEqual(
+            x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]] * 4
+        )
+        self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4, 5]] * 4)
+        self.assertAllEqual(y, [[3, 4, 5, 3, 6]] * 4)
+        self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0, 1.0]] * 4)
+
+    def test_mask_multiple_sentences(self):
+        sentence_one = tf.constant(" airplane")
+        sentence_two = tf.constant(" kohli")
+
+        x, y, sw = self.preprocessor((sentence_one, sentence_two))
+        self.assertAllEqual(
+            x["token_ids"], [0, 12, 12, 2, 2, 12, 12, 2, 1, 1, 1, 1]
+        )
+        self.assertAllEqual(
+            x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
+        )
+        self.assertAllEqual(x["mask_positions"], [1, 2, 5, 6, 0])
+        self.assertAllEqual(y, [3, 4, 7, 8, 0])
+        self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0, 0.0])
+
+    def test_no_masking_zero_rate(self):
+        no_mask_preprocessor = AlbertMaskedLMPreprocessor(
+            self.preprocessor.tokenizer,
+            mask_selection_rate=0.0,
+            mask_selection_length=5,
+            sequence_length=12,
+        )
+        input_data = " airplane at airport"
+
+        x, y, sw = no_mask_preprocessor(input_data)
+        self.assertAllEqual(
+            x["token_ids"], [0, 3, 4, 5, 3, 6, 2, 1, 1, 1, 1, 1]
+        )
+        self.assertAllEqual(
+            x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
+        )
+        self.assertAllEqual(x["mask_positions"], [0, 0, 0, 0, 0])
+        self.assertAllEqual(y, [0, 0, 0, 0, 0])
+        self.assertAllEqual(sw, [0.0, 0.0, 0.0, 0.0, 0.0])
+
+    @parameterized.named_parameters(
+        ("tf_format", "tf", "model"),
+        ("keras_format", "keras_v3", "model.keras"),
+    )
+    def test_saved_model(self, save_format, filename):
+        input_data = tf.constant([" airplane at airport"])
+
+        inputs = keras.Input(dtype="string", shape=())
+        outputs = self.preprocessor(inputs)
+        model = keras.Model(inputs, outputs)
+
+        path = os.path.join(self.get_temp_dir(), filename)
+        model.save(path, save_format=save_format)
+
+        restored_model = keras.models.load_model(path)
+        outputs = model(input_data)[0]["token_ids"]
+        restored_outputs = restored_model(input_data)[0]["token_ids"]
+        self.assertAllEqual(outputs, restored_outputs)