
Commit 1cea763

feat: extract rev in attn_implementation kernels via @ (#40009)
* feat: extract rev in attn_implementation kernels via @
* fix: adjust for ruff
* fix: update regex and add explanatory comment
* fix: move attn_implementation kernel doc
* fix: remove extra line
1 parent: e29919f

File tree

1 file changed (+22, -2 lines)


src/transformers/modeling_utils.py

Lines changed: 22 additions & 2 deletions
@@ -2721,7 +2721,7 @@ def _check_and_adjust_attn_implementation(
             None to sdpa (to potentially eager).
         """
         applicable_attn_implementation = "sdpa" if attn_implementation is None else attn_implementation
-        if re.match(r"^[^/:]+/[^/:]+:?[^/:]+$", applicable_attn_implementation):
+        if re.match(r"^[^/:]+/[^/:]+(?:@[^/:]+)?(?::[^/:]+)?$", applicable_attn_implementation):
             if not is_kernels_available():
                 raise ValueError("kernels is not installed. Please install it with `pip install kernels`.")
             attention_wrapper = None
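
For illustration (a standalone sketch, not part of the patch), the updated pattern accepts the reference forms documented further down, while plain implementation names such as "sdpa" still fall through to the regular handling:

import re

# Pattern copied from the hunk above; the constant name is ours:
# <namespace>/<repo_name>[@<revision>][:<kernel_name>]
KERNEL_REF = r"^[^/:]+/[^/:]+(?:@[^/:]+)?(?::[^/:]+)?$"

refs = [
    "org/model",                       # plain repo
    "org/model@main",                  # pinned to a branch
    "org/model:custom_kernel",         # selects a kernel function
    "org/model@v1.2.3:custom_kernel",  # revision and kernel name together
    "sdpa",                            # built-in implementation: no match
]
for ref in refs:
    print(ref, "->", bool(re.match(KERNEL_REF, ref)))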
@@ -2738,8 +2738,12 @@ def _check_and_adjust_attn_implementation(
                 repo_id = applicable_attn_implementation
                 kernel_name = None
             repo_id = repo_id.strip()
+            # extract the rev after the @ if it exists
+            repo_id, _, rev = repo_id.partition("@")
+            repo_id = repo_id.strip()
+            rev = rev.strip() if rev else None
             try:
-                kernel = get_kernel(repo_id)
+                kernel = get_kernel(repo_id, revision=rev)
                 if hasattr(kernel, "flash_attn_varlen_func"):
                     if attention_wrapper is None:
                         attention_wrapper = flash_attention_forward
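
A minimal sketch of the new revision extraction in isolation (the input string is hypothetical; in the real code the ":<kernel_name>" suffix and any "<wrapper>|" prefix are handled by the surrounding lines):

repo_id = "org/model@v1.2.3"  # hypothetical kernel reference
repo_id, _, rev = repo_id.partition("@")
repo_id = repo_id.strip()
rev = rev.strip() if rev else None
print(repo_id, rev)  # -> org/model v1.2.3
# Without an "@", str.partition returns an empty third element, so rev stays None
# and no specific revision is pinned when get_kernel(repo_id, revision=rev) is called.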
@@ -4494,6 +4498,22 @@ def from_pretrained(
             attn_implementation (`str`, *optional*):
                 The attention implementation to use in the model (if relevant). Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)), or `"flash_attention_3"` (using [Dao-AILab/flash-attention/hopper](https://github.com/Dao-AILab/flash-attention/tree/main/hopper)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation.

+                Accept HF kernel references in the form:
+                <namespace>/<repo_name>[@<revision>][:<kernel_name>]
+
+                - <namespace> and <repo_name> are any non-"/" and non-":" sequences.
+                - "@<revision>" is optional (branch, tag, or commit-ish), e.g. "@main", "@v1.2.0", "@abc123".
+                - ":<kernel_name>" is optional and selects a function inside the kernel repo.
+                - Both options can appear together and in this order only: @revision first, then :kernel_name.
+                - We intentionally allow a leading "<wrapper>|" prefix (e.g., "flash|...") because the code
+                  strips it before loading; '|' is not excluded in the character classes here.
+
+                Examples that match:
+                "org/model"
+                "org/model@main"
+                "org/model:custom_kernel"
+                "org/model@v1.2.3:custom_kernel"
+
             > Parameters for big model inference

             torch_dtype (`str` or `torch.dtype`, *optional*):
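
A hedged usage sketch of the documented format (the model and kernel repository names below are placeholders; per the check above, `kernels` must be installed, e.g. via `pip install kernels`):

from transformers import AutoModelForCausalLM

# "some-org/some-model" and "kernel-org/kernel-repo" are placeholder names.
# The attn_implementation string pins the kernel repo to revision v1.2.3 and
# selects the function named custom_kernel inside it.
model = AutoModelForCausalLM.from_pretrained(
    "some-org/some-model",
    attn_implementation="kernel-org/kernel-repo@v1.2.3:custom_kernel",
)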
