 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
+from ...pytorch_utils import compile_compatible_method_lru_cache
 from ...utils import (
     ModelOutput,
     TransformersKwargs,
@@ -68,6 +69,18 @@ class KeypointMatchingOutput(ModelOutput):
     attentions: Optional[tuple[torch.FloatTensor]] = None
 
 
+@compile_compatible_method_lru_cache(maxsize=32)
+def compute_embeddings(inv_freq: torch.Tensor, embed_height: int, embed_width: int, hidden_size: int) -> torch.Tensor:
+    i_indices = torch.ones(embed_height, embed_width).cumsum(0).float().unsqueeze(-1)
+    j_indices = torch.ones(embed_height, embed_width).cumsum(1).float().unsqueeze(-1)
+
+    emb = torch.zeros(1, embed_height, embed_width, hidden_size // 2)
+    emb[:, :, :, 0::2] = i_indices * inv_freq
+    emb[:, :, :, 1::2] = j_indices * inv_freq
+
+    return emb
+
+
 class EfficientLoFTRRotaryEmbedding(nn.Module):
     inv_freq: torch.Tensor  # fix linting for `register_buffer`
 
@@ -80,23 +93,16 @@ def __init__(self, config: EfficientLoFTRConfig, device=None):
         inv_freq, _ = self.rope_init_fn(self.config, device)
         inv_freq_expanded = inv_freq[None, None, None, :].float().expand(1, 1, 1, -1)
 
-        embed_height, embed_width = config.embedding_size
-        i_indices = torch.ones(embed_height, embed_width).cumsum(0).float().unsqueeze(-1)
-        j_indices = torch.ones(embed_height, embed_width).cumsum(1).float().unsqueeze(-1)
-
-        emb = torch.zeros(1, embed_height, embed_width, self.config.hidden_size // 2)
-        emb[:, :, :, 0::2] = i_indices * inv_freq_expanded
-        emb[:, :, :, 1::2] = j_indices * inv_freq_expanded
-
-        self.register_buffer("inv_freq", emb, persistent=False)
+        self.register_buffer("inv_freq", inv_freq_expanded, persistent=False)
 
     @torch.no_grad()
     def forward(
         self, x: torch.Tensor, position_ids: Optional[tuple[torch.LongTensor, torch.LongTensor]] = None
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        features_height, features_width = x.shape[-2:]
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
         with torch.autocast(device_type=device_type, enabled=False):  # Force float32
-            emb = self.inv_freq
+            emb = compute_embeddings(self.inv_freq, features_height, features_width, self.config.hidden_size)
             sin = emb.sin()
             cos = emb.cos()
 
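
The sketch below is a minimal, standalone illustration (not the library code) of why the refactor is safe: the sin/cos table depends only on the frozen inv_freq buffer and the feature-map size, so it can be rebuilt lazily in forward() and memoized per resolution instead of being fixed to config.embedding_size at construction time. functools.lru_cache stands in for compile_compatible_method_lru_cache, and the inv_freq initialization and hidden size below are placeholder assumptions, not values taken from EfficientLoFTRConfig.

from functools import lru_cache

import torch

HIDDEN_SIZE = 256  # placeholder; the real value comes from EfficientLoFTRConfig


@lru_cache(maxsize=32)  # stand-in for compile_compatible_method_lru_cache
def compute_embeddings(inv_freq: torch.Tensor, embed_height: int, embed_width: int, hidden_size: int) -> torch.Tensor:
    # Row (i) and column (j) positions of every cell in the feature map, shape (H, W, 1)
    i_indices = torch.ones(embed_height, embed_width).cumsum(0).float().unsqueeze(-1)
    j_indices = torch.ones(embed_height, embed_width).cumsum(1).float().unsqueeze(-1)

    # Interleave row- and column-frequency products along the channel axis
    emb = torch.zeros(1, embed_height, embed_width, hidden_size // 2)
    emb[:, :, :, 0::2] = i_indices * inv_freq
    emb[:, :, :, 1::2] = j_indices * inv_freq
    return emb


# Illustrative inv_freq, shaped to broadcast like inv_freq_expanded in __init__
inv_freq = 1.0 / (10000.0 ** (torch.arange(HIDDEN_SIZE // 4).float() / (HIDDEN_SIZE // 4)))
inv_freq = inv_freq[None, None, None, :]

# Tensors hash by object identity, so passing the same registered buffer on every
# call keeps the cache key stable; only a new (height, width) pair recomputes.
for height, width in [(60, 80), (30, 40), (60, 80)]:
    emb = compute_embeddings(inv_freq, height, width, HIDDEN_SIZE)
    print(emb.shape, compute_embeddings.cache_info())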