Commit 65f7498

cleanup nits
1 parent c06bd7f commit 65f7498

File tree: 3 files changed, +4 −2 lines changed

torchtune/modules/attention.py

Lines changed: 2 additions & 1 deletion
@@ -270,7 +270,8 @@ def forward(
         k = self.pos_embeddings(k, input_pos=input_pos)
 
         # k,v shape: [b, n_kv, s_y, h_d]
-        k, v = k.transpose(1, 2), v.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
 
         # Update key-value cache
         if self.kv_cache is not None and self.cache_enabled:
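For context, a minimal standalone sketch (dimensions here are hypothetical, chosen only for illustration) confirming that splitting the tuple assignment into two statements is behaviorally identical:

    import torch

    # Illustrative dimensions, not taken from the commit.
    b, s_y, n_kv, h_d = 2, 4, 8, 16
    k = torch.randn(b, s_y, n_kv, h_d)
    v = torch.randn(b, s_y, n_kv, h_d)

    # Same result as the old `k, v = k.transpose(1, 2), v.transpose(1, 2)`.
    k = k.transpose(1, 2)  # -> [b, n_kv, s_y, h_d]
    v = v.transpose(1, 2)

    assert k.shape == (b, n_kv, s_y, h_d)
    assert v.shape == (b, n_kv, s_y, h_d)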

torchtune/modules/attention_utils.py

Lines changed: 1 addition & 0 deletions
@@ -183,6 +183,7 @@ def _attention_call(
     dropout_p: float,
     is_causal: bool,
 ) -> torch.Tensor:
+
     # Flex attention uses the BlockMask
     # (https://github.com/pytorch/pytorch/blob/main/torch/nn/attention/flex_attention.py#L168)
     # instead of a traditional boolean tensor mask. If this is passed in,
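The comment in this hunk refers to flex attention's BlockMask. As a point of reference, a minimal sketch of that usage, assuming PyTorch 2.5+'s torch.nn.attention.flex_attention API and illustrative shapes (this is not torchtune code; eager CPU execution may require a newer PyTorch, and flex_attention is normally compiled for performance):

    import torch
    from torch.nn.attention.flex_attention import create_block_mask, flex_attention

    # A mask_mod returns True wherever attention is allowed; causal here.
    def causal(b, h, q_idx, kv_idx):
        return q_idx >= kv_idx

    # BlockMask encodes the mask at block granularity, unlike the dense
    # boolean tensor accepted by scaled_dot_product_attention.
    block_mask = create_block_mask(causal, B=None, H=None, Q_LEN=128, KV_LEN=128, device="cpu")

    q = k = v = torch.randn(1, 4, 128, 64)  # [b, n_h, s, h_d], illustrative
    out = flex_attention(q, k, v, block_mask=block_mask)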

torchtune/modules/kv_cache.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def reset(self) -> None:
 
     @property
     def size(self) -> int:
-        return int(self.cache_pos[0].item())
+        return self.cache_pos[0].item()
 
     def update(
         self, k_val: torch.Tensor, v_val: torch.Tensor
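The dropped int(...) cast was redundant: Tensor.item() on an integer-dtype tensor already returns a native Python int, which satisfies the declared -> int return type. A small standalone check (cache_pos below is a stand-in for the KVCache attribute, not its real initialization):

    import torch

    cache_pos = torch.arange(8)   # integer dtype (torch.int64)
    size = cache_pos[0].item()    # already a Python int; no cast needed
    assert isinstance(size, int)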
