
Commit 3ab5189

Merge branch 'add_small_100_preset' of https://github.com/pkgoogle/keras-hub into add_small_100_preset

2 parents: 948ea8f + 921e113

10 files changed: +174 −11 lines

CONTRIBUTING.md

+3 −3

@@ -58,7 +58,7 @@ development environment and run the unit tests. This is covered in section
 ### Step 3. Create a pull request
 
 Once the change is ready, open a pull request from your branch in your fork to
-the master branch in
+the master branch in
 [keras-team/keras-hub](https://github.com/keras-team/keras-hub).
 
 ### Step 4. Sign the Contributor License Agreement

@@ -114,13 +114,13 @@ environement supports all backends without cuda, and each backend environement
 has cuda support.
 
 ```shell
-conda create -y -n keras-hub-cpu python=3.10
+conda create -y -n keras-hub-cpu python=3.9
 conda activate keras-hub-cpu
 pip install -r requirements.txt # install deps
 pip install -e . # install keras-hub
 
 for backend in "jax" "torch" "tensorflow"; do
-conda create -y -n keras-hub-${backend} python=3.10
+conda create -y -n keras-hub-${backend} python=3.9
 conda activate keras-hub-${backend}
 pip install -r requirements-${backend}-cuda.txt # install deps
 pip install -e . # install keras-hub
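After activating one of the per-backend environments created above, a quick sanity check is to pin the backend and confirm the editable install is importable. This is a generic sketch for illustration, not part of the contributing guide itself.

```python
import os

# KERAS_BACKEND must be set before Keras is imported; "jax", "torch" and
# "tensorflow" match the environments created by the loop above.
os.environ["KERAS_BACKEND"] = "jax"

import keras
import keras_hub  # noqa: F401  # confirms `pip install -e .` worked

print(keras.config.backend())  # -> "jax"
```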

README.md

+7

@@ -102,6 +102,13 @@ To install the latest KerasHub release with Keras 3, simply run:
 pip install --upgrade keras-hub
 ```
 
+Our text tokenizers are based on TensorFlow Text. Hence, if you are using any
+model which has language as a modality, you will have to run:
+
+```
+pip install --upgrade keras-hub[nlp]
+```
+
 To install the latest nightly changes for both KerasHub and Keras, you can use
 our nightly package.
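As a rough illustration of why the extra is needed: KerasHub's text tokenizers call into TensorFlow Text when a text preset is loaded, so the load only succeeds once `keras-hub[nlp]` (or `tensorflow-text` directly) is installed. The preset name below is just an example.

```python
import keras_hub

# Loading a text tokenizer requires TensorFlow Text under the hood; without
# the `nlp` extra this raises an ImportError. "bert_base_en" is one example
# preset from the KerasHub collection.
tokenizer = keras_hub.tokenizers.Tokenizer.from_preset("bert_base_en")
print(tokenizer("The quick brown fox."))
```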

keras_hub/src/models/backbone.py

+10 −1

@@ -186,14 +186,23 @@ def save_to_preset(self, preset_dir):
         saver = get_preset_saver(preset_dir)
         saver.save_backbone(self)
 
+    def get_lora_target_names(self):
+        """Returns list of layer names which are to be LoRA-fied.
+
+        Subclasses can override this method if the names of layers to be
+        LoRA-fied are different.
+        """
+        return ["query_dense", "value_dense", "query", "value"]
+
     def enable_lora(self, rank):
         """Enable Lora on the backbone.
 
         Calling this method will freeze all weights on the backbone,
         while enabling Lora on the query & value `EinsumDense` layers
         of the attention layers.
         """
-        target_names = ["query_dense", "value_dense", "query", "value"]
+        target_names = self.get_lora_target_names()
+
         self.trainable = True
         self._lora_enabled_layers = []
         self._lora_rank = rank
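A minimal sketch of what this new hook enables: a model whose attention projections use different layer names can override `get_lora_target_names()` instead of re-implementing `enable_lora()`. The class and layer name below are hypothetical, for illustration only.

```python
from keras_hub.src.models.backbone import Backbone


class MyBackbone(Backbone):
    """Hypothetical backbone whose attention layers use a custom name."""

    def get_lora_target_names(self):
        # Keep the default query/value targets and add our own layer name.
        return super().get_lora_target_names() + ["my_attention_dense"]


# enable_lora() now resolves its targets through the hook, so calling
# backbone.enable_lora(rank=4) would also LoRA-fy "my_attention_dense".
```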

keras_hub/src/models/gemma/gemma_attention.py

+32 −1

@@ -4,6 +4,7 @@
 
 from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding
 from keras_hub.src.utils.keras_utils import clone_initializer
+from keras_hub.src.utils.keras_utils import has_flash_attention_support
 
 
 class CachedGemmaAttention(keras.layers.Layer):

@@ -117,6 +118,36 @@ def _compute_attention(
         query_normalization = 1 / np.sqrt(
             self.hidden_dim // self.num_query_heads
         )
+        use_dot_product_attention = not (
+            self.dropout > 0.0 or (len(q.shape) != 4)
+        )
+        if has_flash_attention_support() and use_dot_product_attention:
+            if self.dropout > 0.0:
+                raise ValueError(
+                    "Flash attention does not support dropout. "
+                    "Please set `dropout` to 0.0."
+                )
+            if attention_mask is not None:
+                while len(attention_mask.shape) < 4:
+                    attention_mask = ops.expand_dims(
+                        attention_mask, axis=1
+                    )  # Add dimension for num_heads
+                if attention_mask.shape[1] != self.num_query_heads:
+                    attention_mask = ops.tile(
+                        attention_mask, [1, self.num_query_heads, 1, 1]
+                    )
+
+            attention_output = ops.dot_product_attention(
+                query=q,
+                key=k,
+                value=v,
+                bias=None,
+                mask=attention_mask,
+                scale=query_normalization,
+                is_causal=True,
+                flash_attention=True,
+            )
+            return attention_output
 
         q *= ops.cast(query_normalization, dtype=q.dtype)
         q_shape = ops.shape(q)

@@ -131,8 +162,8 @@ def _compute_attention(
         )
         b, q_len, _, _, h = ops.shape(q)
 
+        # Fallback to standard attention if flash attention is disabled
         attention_logits = ops.einsum("btkgh,bskh->bkgts", q, k)
-
         if self.logit_soft_cap is not None:
             attention_logits = ops.divide(attention_logits, self.logit_soft_cap)
         attention_logits = ops.multiply(
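To make the fused path concrete, here is a small standalone sketch of `keras.ops.dot_product_attention` using the same `(batch, seq_len, num_heads, head_dim)` layout that the `len(q.shape) == 4` check expects. It assumes a Keras version recent enough to expose this op (the diff above relies on the same); the shapes and scale are illustrative only.

```python
import numpy as np
from keras import ops

batch, seq_len, num_heads, head_dim = 2, 8, 4, 16
shape = (batch, seq_len, num_heads, head_dim)
q, k, v = [
    ops.convert_to_tensor(np.random.rand(*shape), dtype="float32")
    for _ in range(3)
]

# Causal attention scaled by 1 / sqrt(head_dim), mirroring the layer's
# `query_normalization`; leaving `flash_attention` unset lets Keras decide
# whether a fused kernel is available on the current backend.
out = ops.dot_product_attention(
    q, k, v, scale=1.0 / np.sqrt(head_dim), is_causal=True
)
print(out.shape)  # (2, 8, 4, 16)
```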

keras_hub/src/models/pali_gemma/pali_gemma_backbone.py

+7

@@ -274,6 +274,13 @@ def __init__(
         # Keep the image_sequence_length as a backbone property for easy access.
         self.image_sequence_length = self.vit_encoder.image_sequence_length
 
+    def get_lora_target_names(self):
+        target_names = super().get_lora_target_names()
+
+        # Add these for `PaliGemmaVITAttention`.
+        target_names += ["query_proj", "value_proj"]
+        return target_names
+
     def get_config(self):
         config = super().get_config()
         config.update(
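With this override in place, enabling LoRA on a PaliGemma backbone targets both the Gemma attention layers and the SigLIP ViT projections. A rough usage sketch (the preset name is one registered in this repository; downloading the weights requires Kaggle access):

```python
import keras_hub

backbone = keras_hub.models.PaliGemmaBackbone.from_preset("pali_gemma2_mix_3b_224")
print(backbone.get_lora_target_names())
# ["query_dense", "value_dense", "query", "value", "query_proj", "value_proj"]

# Freezes the base weights and adds rank-4 LoRA adapters to every matching
# EinsumDense layer, now including the ViT attention projections.
backbone.enable_lora(rank=4)
```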

keras_hub/src/models/pali_gemma/pali_gemma_presets.py

+93 −3

@@ -83,6 +83,96 @@
         },
         "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_ft_docci_10b_448/2",
     },
+    "pali_gemma2_mix_3b_224": {
+        "metadata": {
+            "description": (
+                "3 billion parameter, image size 224, 27-layer for "
+                "SigLIP-So400m vision encoder and 26-layer Gemma2 2B language "
+                "model. This model has been fine-tuned on a wide range of "
+                "vision-language tasks and domains."
+            ),
+            "params": 3032094960,
+            "official_name": "PaliGemma2",
+            "path": "pali_gemma2",
+            "model_card": "https://www.kaggle.com/models/google/paligemma-2",
+        },
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_mix_3b_224/2",
+    },
+    "pali_gemma2_mix_3b_448": {
+        "metadata": {
+            "description": (
+                "3 billion parameter, image size 448, 27-layer for "
+                "SigLIP-So400m vision encoder and 26-layer Gemma2 2B language "
+                "model. This model has been fine-tuned on a wide range of "
+                "vision-language tasks and domains."
+            ),
+            "params": 3032979696,
+            "official_name": "PaliGemma2",
+            "path": "pali_gemma2",
+            "model_card": "https://www.kaggle.com/models/google/paligemma-2",
+        },
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_mix_3b_448/2",
+    },
+    "pali_gemma2_mix_10b_224": {
+        "metadata": {
+            "description": (
+                "10 billion parameter, image size 224, 27-layer for "
+                "SigLIP-So400m vision encoder and 42-layer Gemma2 9B language "
+                "model. This model has been fine-tuned on a wide range of "
+                "vision-language tasks and domains."
+            ),
+            "params": 9662409456,
+            "official_name": "PaliGemma2",
+            "path": "pali_gemma2",
+            "model_card": "https://www.kaggle.com/models/google/paligemma-2",
+        },
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_mix_10b_224/2",
+    },
+    "pali_gemma2_mix_10b_448": {
+        "metadata": {
+            "description": (
+                "10 billion parameter, image size 448, 27-layer for "
+                "SigLIP-So400m vision encoder and 42-layer Gemma2 9B language "
+                "model. This model has been fine-tuned on a wide range of "
+                "vision-language tasks and domains."
+            ),
+            "params": 9663294192,
+            "official_name": "PaliGemma2",
+            "path": "pali_gemma2",
+            "model_card": "https://www.kaggle.com/models/google/paligemma-2",
+        },
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_mix_10b_448/2",
+    },
+    "pali_gemma2_mix_28b_224": {
+        "metadata": {
+            "description": (
+                "28 billion parameter, image size 224, 27-layer for "
+                "SigLIP-So400m vision encoder and 46-layer Gemma2 27B language "
+                "model. This model has been fine-tuned on a wide range of "
+                "vision-language tasks and domains."
+            ),
+            "params": 27650192112,
+            "official_name": "PaliGemma2",
+            "path": "pali_gemma2",
+            "model_card": "https://www.kaggle.com/models/google/paligemma-2",
+        },
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_28b_mix_224/2",
+    },
+    "pali_gemma2_mix_28b_448": {
+        "metadata": {
+            "description": (
+                "28 billion parameter, image size 448, 27-layer for "
+                "SigLIP-So400m vision encoder and 46-layer Gemma2 27B language "
+                "model. This model has been fine-tuned on a wide range of "
+                "vision-language tasks and domains."
+            ),
+            "params": 27650192112,
+            "official_name": "PaliGemma2",
+            "path": "pali_gemma2",
+            "model_card": "https://www.kaggle.com/models/google/paligemma-2",
+        },
+        "kaggle_handle": "kaggle://keras/paligemma2/keras/pali_gemma2_28b_mix_448/2",
+    },
     "pali_gemma2_pt_3b_224": {
         "metadata": {
             "description": (

@@ -181,7 +271,7 @@
                 "model. This model has been pre-trained on a mixture of "
                 "datasets."
             ),
-            "params": 9662409456,
+            "params": 27650192112,
             "official_name": "PaliGemma2",
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",

@@ -196,7 +286,7 @@
                 "model. This model has been pre-trained on a mixture of "
                 "datasets."
             ),
-            "params": 9663294192,
+            "params": 27650192112,
             "official_name": "PaliGemma2",
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",

@@ -211,7 +301,7 @@
                 "model. This model has been pre-trained on a mixture of "
                 "datasets."
             ),
-            "params": 9666833136,
+            "params": 27650192112,
             "official_name": "PaliGemma2",
             "path": "pali_gemma2",
             "model_card": "https://www.kaggle.com/models/google/paligemma-2",

keras_hub/src/utils/keras_utils.py

+13 −1

@@ -56,7 +56,19 @@ def standardize_data_format(data_format):
 
 
 def has_flash_attention_support():
-    if hasattr(keras.config, "is_flash_attention_enabled"):
+    if (
+        hasattr(keras.config, "is_flash_attention_enabled")
+        and keras.config.backend() == "jax"
+    ):
+        try:
+            from jax.nn import dot_product_attention as dot_product_attention
+        except ImportError:
+            logging.warning(
+                "Flash attention is not supported in your current JAX version. "
+                "Please update it by following the official guide: "
+                "https://jax.readthedocs.io/en/latest/installation.html"
+            )
+            return False
         return True
     else:
         return False
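The probe added here boils down to an import check. A standalone sketch of the same pattern, safe to run even without JAX installed, since a missing package also raises `ImportError`:

```python
def jax_has_fused_attention():
    # `jax.nn.dot_product_attention` only exists in sufficiently new JAX
    # releases; treat its absence (or a missing JAX install) as "no flash
    # attention support".
    try:
        from jax.nn import dot_product_attention  # noqa: F401
    except ImportError:
        return False
    return True


print(jax_has_fused_attention())
```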

requirements.txt

+2 −1

@@ -1,5 +1,6 @@
 # Tensorflow.
-tensorflow-cpu~=2.18
+tensorflow-cpu~=2.18.0;sys_platform != 'darwin'
+tensorflow~=2.18.0;sys_platform == 'darwin'
 tensorflow-text~=2.18
 
 # Torch.
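The two new requirement lines rely on PEP 508 environment markers, which pip evaluates against the running interpreter. The selection is roughly equivalent to this Python sketch:

```python
import sys

# pip's `sys_platform` marker maps to Python's sys.platform: "darwin" on
# macOS, so macOS gets the plain `tensorflow` wheel and every other platform
# gets the CPU-only build.
if sys.platform == "darwin":
    requirement = "tensorflow~=2.18.0"
else:
    requirement = "tensorflow-cpu~=2.18.0"
print(requirement)
```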

setup.py

+1 −1

@@ -45,13 +45,13 @@ def get_version(rel_path):
         "regex",
         "rich",
         "kagglehub",
-        "tensorflow-text",
     ],
     extras_require={
         "extras": [
             "rouge-score",
             "sentencepiece",
         ],
+        "nlp": ["tensorflow-text"],
     },
     # Supported Python versions
     python_requires=">=3.9",

tools/checkpoint_conversion/convert_pali_gemma2_checkpoints.py

+6

@@ -69,6 +69,12 @@
     "pali_gemma2_10b_ft_docci_448": (
         "google/paligemma-2/jax/paligemma2-10b-ft-docci-448"
     ),
+    "pali_gemma2_3b_mix_224": "google/paligemma-2/jax/paligemma2-3b-mix-224",
+    "pali_gemma2_3b_mix_448": "google/paligemma-2/jax/paligemma2-3b-mix-448",
+    "pali_gemma2_10b_mix_224": "google/paligemma-2/jax/paligemma2-10b-mix-224",
+    "pali_gemma2_10b_mix_448": "google/paligemma-2/jax/paligemma2-10b-mix-448",
+    "pali_gemma2_28b_mix_224": "google/paligemma-2/jax/paligemma2-28b-mix-224",
+    "pali_gemma2_28b_mix_448": "google/paligemma-2/jax/paligemma2-28b-mix-448",
     "pali_gemma2_3b_pt_224": "google/paligemma-2/jax/paligemma2-3b-pt-224",
     "pali_gemma2_3b_pt_448": "google/paligemma-2/jax/paligemma2-3b-pt-448",
     "pali_gemma2_3b_pt_896": "google/paligemma-2/jax/paligemma2-3b-pt-896",
