 from typing import Optional, Tuple, Union

 import torch
-from executorch import version as executorch_version
-from packaging import version as pkg_version
+from executorch.extension.llm.custom_ops.custom_ops import custom_sdpa  # noqa


-if pkg_version.parse(executorch_version.__version__) >= pkg_version.parse("0.6.0"):
-    from executorch.extension.llm.custom_ops.custom_ops import custom_sdpa  # noqa
+def custom_sdpa_with_start_pos_forward(
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Union[torch.Tensor, "BlockMask"],  # noqa
+    scaling: Optional[float] = None,
+    softcap: Optional[float] = None,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+) -> Tuple[torch.Tensor, None]:
+    # This is before the transpose
+    max_seq_len = key.shape[2]

-    def custom_sdpa_with_start_pos_forward(
-        module: torch.nn.Module,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        attention_mask: Union[torch.Tensor, "BlockMask"],  # noqa
-        scaling: Optional[float] = None,
-        softcap: Optional[float] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, None]:
-        # This is before the transpose
-        max_seq_len = key.shape[2]
+    # FA2 uses non-transposed inputs
+    query = query.transpose(1, 2)
+    key = key.transpose(1, 2)
+    value = value.transpose(1, 2)

-        # FA2 uses non-transposed inputs
-        query = query.transpose(1, 2)
-        key = key.transpose(1, 2)
-        value = value.transpose(1, 2)
+    # Convert the hell out of the inputs to fp32 and back
+    input_dtype = query.dtype
+    query = query.to(torch.float32)
+    key = key.to(torch.float32)
+    value = value.to(torch.float32)

-        # Convert the hell out of the inputs to fp32 and back
-        input_dtype = query.dtype
-        query = query.to(torch.float32)
-        key = key.to(torch.float32)
-        value = value.to(torch.float32)
+    # Ignore the causal flag from kwargs but use the one in module
+    kwargs.pop("is_causal", None)

-        # Ignore the causal flag from kwargs but use the one in module
-        kwargs.pop("is_causal", None)
-
-        # Calculate the input pos from attention mask.
-        # Branch out for float vs bool mask
-        # assert attention_mask.dim() == 2, f"attention_mask must be a 2D matrix."
-        attention_mask = attention_mask.reshape(-1, max_seq_len)
-        first_row_mask = attention_mask[0, :]
-        # [0, 0, 0, 0, -inf, -inf, -inf, -inf], start_pos = 3
-        start_pos = torch.argmin(first_row_mask).item() - 1
-        output = torch.ops.llama.custom_sdpa(
-            query,
-            key,
-            value,
-            start_pos=start_pos,
-            attn_mask=None,
-            drpout_p=0.0,
-            is_causal=module.is_causal,
-            scale=scaling,
-        )
-        return output.to(input_dtype), None
+    # Calculate the input pos from attention mask.
+    # Branch out for float vs bool mask
+    # assert attention_mask.dim() == 2, f"attention_mask must be a 2D matrix."
+    attention_mask = attention_mask.reshape(-1, max_seq_len)
+    first_row_mask = attention_mask[0, :]
+    # [0, 0, 0, 0, -inf, -inf, -inf, -inf], start_pos = 3
+    start_pos = torch.argmin(first_row_mask).item() - 1
+    output = torch.ops.llama.custom_sdpa(
+        query,
+        key,
+        value,
+        start_pos=start_pos,
+        attn_mask=None,
+        drpout_p=0.0,
+        is_causal=module.is_causal,
+        scale=scaling,
+    )
+    return output.to(input_dtype), None
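For readers skimming the hunk: the new top-level forward infers start_pos from a float causal mask, where attendable positions hold 0.0 and masked positions hold -inf, so torch.argmin picks out the first masked column and the index just before it is the last valid position. A minimal, self-contained sketch of that derivation with made-up values (illustrative only, not part of the patch):

import torch

# Float causal mask row for max_seq_len = 8 with four attendable positions.
# argmin() returns the index of the first -inf (4), so start_pos = 4 - 1 = 3,
# matching the "[0, 0, 0, 0, -inf, -inf, -inf, -inf], start_pos = 3" comment.
max_seq_len = 8
first_row_mask = torch.tensor([0.0, 0.0, 0.0, 0.0] + [float("-inf")] * 4)
start_pos = torch.argmin(first_row_mask).item() - 1
assert start_pos == 3

As the "Branch out for float vs bool mask" comment notes, this only works for an additive float mask; a boolean mask would need a separate branch.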
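The hunk does not show how this forward gets attached to a model. A hypothetical hookup, assuming the Hugging Face Transformers AttentionInterface registry (the signature above matches that calling convention); the registry name used here is made up:

# Hypothetical wiring, not from this patch.
from transformers import AttentionInterface

AttentionInterface.register("custom_sdpa_with_start_pos", custom_sdpa_with_start_pos_forward)
# A model loaded with attn_implementation="custom_sdpa_with_start_pos" would then
# route its attention calls through the ExecuTorch torch.ops.llama.custom_sdpa op.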