# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import pytest
import torch

from torchtune.models.gemma2._attention_mask import get_sliding_attention_mask

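# A minimal reference sketch of the mask semantics these tests assert, assuming
# the convention that query position i may attend to key positions j with
# i - window_size < j <= i. The helper name and construction below are
# illustrative only; they are not torchtune's implementation.
def _reference_sliding_mask(bsz: int, seq_len: int, window_size: int) -> torch.Tensor:
    """Build a [bsz, seq_len, seq_len] boolean sliding causal mask (reference only)."""
    ones = torch.ones(seq_len, seq_len, dtype=torch.bool)
    # Causal lower triangle, with keys older than the window masked out.
    banded = torch.tril(ones) & torch.triu(ones, diagonal=-(window_size - 1))
    return banded.unsqueeze(0).expand(bsz, seq_len, seq_len)
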
class TestGetSlidingAttentionMask:
    @pytest.fixture
    def basic_params(self):
        return {"bsz": 2, "seq_len": 4, "sliding_window_size": 2, "device": None}

    def test_get_sliding_attention_mask(self, basic_params):
        """Test that when mask is None, a causal mask is created and the sliding window is applied."""
        mask = get_sliding_attention_mask(
            mask=None,
            sliding_window_size=basic_params["sliding_window_size"],
            bsz=basic_params["bsz"],
            seq_len=basic_params["seq_len"],
            device=basic_params["device"],
        )

        assert mask.shape == (
            basic_params["bsz"],
            basic_params["seq_len"],
            basic_params["seq_len"],
        )
        assert mask.dtype == torch.bool

        # Check that the mask has the expected sliding window pattern.
        # True positions can be attended to; False positions are masked out.
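        # With sliding_window_size=2, query position i can attend only to keys
        # i-1 and i (clamped at position 0), intersected with causality.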
        expected_pattern = torch.tensor(
            [
                [True, False, False, False],
                [True, True, False, False],
                [False, True, True, False],
                [False, False, True, True],
            ],
            dtype=torch.bool,
        )

        # Check first batch element
        torch.testing.assert_close(mask[0], expected_pattern)
        # All batch elements should be identical
        torch.testing.assert_close(mask[0], mask[1])

    def test_get_sliding_attention_mask_different_window_sizes(self):
        """Test sliding window with different window sizes."""
        bsz, seq_len = 1, 5

        # Test window size 1 (only current position)
        mask = get_sliding_attention_mask(
            mask=None,
            sliding_window_size=1,
            bsz=bsz,
            seq_len=seq_len,
            device=None,
        )

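        # Window size 1 reduces to a diagonal mask: each query attends only to itself.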
        expected_window_1 = torch.tensor(
            [
                [True, False, False, False, False],
                [False, True, False, False, False],
                [False, False, True, False, False],
                [False, False, False, True, False],
                [False, False, False, False, True],
            ],
            dtype=torch.bool,
        )

        torch.testing.assert_close(mask[0], expected_window_1)

        # Test window size 3
        mask = get_sliding_attention_mask(
            mask=None,
            sliding_window_size=3,
            bsz=bsz,
            seq_len=seq_len,
            device=None,
        )

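        # Window size 3: query position i attends to keys i-2, i-1, and i (clamped at 0).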
        expected_window_3 = torch.tensor(
            [
                [True, False, False, False, False],
                [True, True, False, False, False],
                [True, True, True, False, False],
                [False, True, True, True, False],
                [False, False, True, True, True],
            ],
            dtype=torch.bool,
        )

        torch.testing.assert_close(mask[0], expected_window_3)

    def test_get_sliding_attention_mask_large_window(self):
        """Test sliding window larger than sequence length."""
        bsz, seq_len = 1, 3
        sliding_window_size = 5  # Larger than seq_len

        mask = get_sliding_attention_mask(
            mask=None,
            sliding_window_size=sliding_window_size,
            bsz=bsz,
            seq_len=seq_len,
            device=None,
        )

        # Should behave like a regular causal mask when window is larger than seq_len
        expected_causal = torch.tensor(
            [
                [True, False, False],
                [True, True, False],
                [True, True, True],
            ],
            dtype=torch.bool,
        )

        torch.testing.assert_close(mask[0], expected_causal)