huggingface · gante · Dec 14, 2023 · Dec 14, 2023 · Dec 14, 2023 · Dec 14, 2023
@@ -385,11 +385,16 @@ def forward(
 
         if past_key_value is not None:
             # Activate slicing cache only if the config has a value `sliding_windows` attribute
-            if getattr(self.config, "sliding_window", None) is not None and kv_seq_len > self.config.sliding_window:
+            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
+            if (
+                getattr(self.config, "sliding_window", None) is not None
+                and kv_seq_len > self.config.sliding_window
+                and cache_has_contents
+            ):
                 slicing_tokens = 1 - self.config.sliding_window
 
-                past_key = past_key_value[0]
-                past_value = past_key_value[1]
+                past_key = past_key_value[self.layer_idx][0]
+                past_value = past_key_value[self.layer_idx][1]
 
                 past_key = past_key[:, :, slicing_tokens:, :].contiguous()
                 past_value = past_value[:, :, slicing_tokens:, :].contiguous()
@@ -400,8 +405,6 @@ def forward(
                         f" {past_key.shape}"
                     )
 
-                past_key_value = (past_key, past_value)
-
                 if attention_mask is not None:
                     attention_mask = attention_mask[:, slicing_tokens:]
                     attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)

@@ -436,11 +436,16 @@ def forward(
 
         if past_key_value is not None:
             # Activate slicing cache only if the config has a value `sliding_windows` attribute
-            if getattr(self.config, "sliding_window", None) is not None and kv_seq_len > self.config.sliding_window:
+            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
+            if (
+                getattr(self.config, "sliding_window", None) is not None
+                and kv_seq_len > self.config.sliding_window
+                and cache_has_contents
+            ):
                 slicing_tokens = 1 - self.config.sliding_window
 
-                past_key = past_key_value[0]
-                past_value = past_key_value[1]
+                past_key = past_key_value[self.layer_idx][0]
+                past_value = past_key_value[self.layer_idx][1]
 
                 past_key = past_key[:, :, slicing_tokens:, :].contiguous()
                 past_value = past_value[:, :, slicing_tokens:, :].contiguous()
@@ -451,8 +456,6 @@ def forward(
                         f" {past_key.shape}"
                     )
 
-                past_key_value = (past_key, past_value)
-
                 if attention_mask is not None:
                     attention_mask = attention_mask[:, slicing_tokens:]
                     attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)