
Commit 5c80ed2

Enable Flash attention in Gemma (keras-team#2064)
* add flash attention to gemma
* update attention mask
* code reformat
* use flash attention detection from utils
* add gemma flash attention
* enable only in jax backend
* update jax version
* Update requirements-jax-cuda.txt
* update jax version in requirements
* update to python 3.10
* add quotes on python version
* force jax to be 0.5.0
* check if dot product attention is supported
* update python version
* update python version
* unpin stable
* change back
1 parent ebc56b4 commit 5c80ed2
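
As a quick orientation, the sketch below shows how this change is meant to be exercised from user code. It is not part of the commit: the `KERAS_BACKEND` setup, the `gemma_2b_en` preset name, and the `GemmaCausalLM.from_preset` call are assumptions about a typical KerasHub setup, and the new fast path only engages when `has_flash_attention_support()` returns `True` and attention dropout is 0.

```python
# Hedged usage sketch (not from the commit): run Gemma on the JAX backend so
# the new flash-attention branch in CachedGemmaAttention can activate.
import os

os.environ["KERAS_BACKEND"] = "jax"  # the fast path is only wired up for JAX

import keras
import keras_hub

# The new has_flash_attention_support() helper keys off this config API.
print("flash attention enabled:", keras.config.is_flash_attention_enabled())

# Preset name is illustrative; any Gemma preset with zero attention dropout works.
gemma_lm = keras_hub.models.GemmaCausalLM.from_preset("gemma_2b_en")
print(gemma_lm.generate("Flash attention speeds up", max_length=32))
```

On non-JAX backends, or on JAX versions without `jax.nn.dot_product_attention`, generation still works; the model falls back to the einsum attention path shown in the diff below.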

3 files changed: +48 -5 lines changed

CONTRIBUTING.md (+3, -3)
```diff
@@ -58,7 +58,7 @@ development environment and run the unit tests. This is covered in section
 ### Step 3. Create a pull request
 
 Once the change is ready, open a pull request from your branch in your fork to
-the master branch in
+the master branch in
 [keras-team/keras-hub](https://github.com/keras-team/keras-hub).
 
 ### Step 4. Sign the Contributor License Agreement
@@ -114,13 +114,13 @@ environement supports all backends without cuda, and each backend environement
 has cuda support.
 
 ```shell
-conda create -y -n keras-hub-cpu python=3.10
+conda create -y -n keras-hub-cpu python=3.9
 conda activate keras-hub-cpu
 pip install -r requirements.txt # install deps
 pip install -e . # install keras-hub
 
 for backend in "jax" "torch" "tensorflow"; do
-conda create -y -n keras-hub-${backend} python=3.10
+conda create -y -n keras-hub-${backend} python=3.9
 conda activate keras-hub-${backend}
 pip install -r requirements-${backend}-cuda.txt # install deps
 pip install -e . # install keras-hub
```

keras_hub/src/models/gemma/gemma_attention.py (+32, -1)
```diff
@@ -4,6 +4,7 @@
 
 from keras_hub.src.layers.modeling.rotary_embedding import RotaryEmbedding
 from keras_hub.src.utils.keras_utils import clone_initializer
+from keras_hub.src.utils.keras_utils import has_flash_attention_support
 
 
 class CachedGemmaAttention(keras.layers.Layer):
@@ -117,6 +118,36 @@ def _compute_attention(
             query_normalization = 1 / np.sqrt(
                 self.hidden_dim // self.num_query_heads
             )
+        use_dot_product_attention = not (
+            self.dropout > 0.0 or (len(q.shape) != 4)
+        )
+        if has_flash_attention_support() and use_dot_product_attention:
+            if self.dropout > 0.0:
+                raise ValueError(
+                    "Flash attention does not support dropout. "
+                    "Please set `dropout` to 0.0."
+                )
+            if attention_mask is not None:
+                while len(attention_mask.shape) < 4:
+                    attention_mask = ops.expand_dims(
+                        attention_mask, axis=1
+                    )  # Add dimension for num_heads
+                if attention_mask.shape[1] != self.num_query_heads:
+                    attention_mask = ops.tile(
+                        attention_mask, [1, self.num_query_heads, 1, 1]
+                    )
+
+            attention_output = ops.dot_product_attention(
+                query=q,
+                key=k,
+                value=v,
+                bias=None,
+                mask=attention_mask,
+                scale=query_normalization,
+                is_causal=True,
+                flash_attention=True,
+            )
+            return attention_output
 
         q *= ops.cast(query_normalization, dtype=q.dtype)
         q_shape = ops.shape(q)
@@ -131,8 +162,8 @@ def _compute_attention(
         )
         b, q_len, _, _, h = ops.shape(q)
 
+        # Fallback to standard attention if flash attention is disabled
         attention_logits = ops.einsum("btkgh,bskh->bkgts", q, k)
-
         if self.logit_soft_cap is not None:
             attention_logits = ops.divide(attention_logits, self.logit_soft_cap)
             attention_logits = ops.multiply(
```
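
To make the new branch easier to follow in isolation, here is a minimal, self-contained sketch of the same mask broadcasting plus `keras.ops.dot_product_attention` call. It is not the commit's code: the tensor shapes, head count, and the all-ones mask are made up for illustration, it assumes a Keras version new enough to expose the `flash_attention` argument, and `flash_attention` is left as `None` (auto) so the snippet also runs on hardware without a flash kernel, whereas the commit passes `flash_attention=True` after its support check.

```python
import numpy as np
from keras import ops

# Illustrative shapes: (batch, seq_len, num_heads, head_dim), the layout
# keras.ops.dot_product_attention expects.
batch, seq_len, num_heads, head_dim = 2, 16, 8, 32
rng = np.random.default_rng(0)


def rand(shape):
    return ops.convert_to_tensor(rng.standard_normal(shape).astype("float32"))


q = rand((batch, seq_len, num_heads, head_dim))
k = rand((batch, seq_len, num_heads, head_dim))
v = rand((batch, seq_len, num_heads, head_dim))

# Start from a 3D (batch, q_len, kv_len) mask, as Gemma typically passes in,
# then broadcast it to (batch, num_heads, q_len, kv_len) the same way the
# new branch does.
attention_mask = ops.ones((batch, seq_len, seq_len), dtype="bool")
while len(attention_mask.shape) < 4:
    attention_mask = ops.expand_dims(attention_mask, axis=1)  # add num_heads dim
if attention_mask.shape[1] != num_heads:
    attention_mask = ops.tile(attention_mask, [1, num_heads, 1, 1])

attention_output = ops.dot_product_attention(
    query=q,
    key=k,
    value=v,
    bias=None,
    mask=attention_mask,
    scale=1.0 / np.sqrt(head_dim),  # mirrors query_normalization
    is_causal=True,
    flash_attention=None,  # auto; the commit passes True after its check
)
print(attention_output.shape)  # (2, 16, 8, 32)
```

Leaving `flash_attention` unset lets Keras pick the best available kernel, which keeps the sketch runnable on CPU; the commit can pass `True` because `has_flash_attention_support()` has already vetted the backend.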

keras_hub/src/utils/keras_utils.py (+13, -1)
```diff
@@ -56,7 +56,19 @@ def standardize_data_format(data_format):
 
 
 def has_flash_attention_support():
-    if hasattr(keras.config, "is_flash_attention_enabled"):
+    if (
+        hasattr(keras.config, "is_flash_attention_enabled")
+        and keras.config.backend() == "jax"
+    ):
+        try:
+            from jax.nn import dot_product_attention as dot_product_attention
+        except ImportError:
+            logging.warning(
+                "Flash attention is not supported in your current JAX version. "
+                "Please update it by following the official guide: "
+                "https://jax.readthedocs.io/en/latest/installation.html"
+            )
+            return False
         return True
     else:
         return False
```
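
For reference, the same capability probe can be written as a standalone script. This is a hedged sketch, not the project's helper: the function name `jax_flash_attention_available` and the stdlib `logging` usage are illustrative (the real `keras_utils.py` has its own `logging` import), and the only assumptions are the checks the diff itself makes.

```python
import logging

import keras


def jax_flash_attention_available():
    """Illustrative stand-in for has_flash_attention_support()."""
    if not hasattr(keras.config, "is_flash_attention_enabled"):
        # Older Keras 3 releases predate the flash-attention config API.
        return False
    if keras.config.backend() != "jax":
        # The commit only enables the fast path on the JAX backend.
        return False
    try:
        # Present only in newer JAX releases; the import doubles as a version check.
        from jax.nn import dot_product_attention  # noqa: F401
    except ImportError:
        logging.warning(
            "jax.nn.dot_product_attention is unavailable; upgrade JAX "
            "(see https://jax.readthedocs.io/en/latest/installation.html)."
        )
        return False
    return True


print("flash attention usable:", jax_flash_attention_available())
```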
