Bug fixes #1004

Merged: 116 commits, Sep 8, 2024

Commits
7b81ca5
Update _utils.py
danielhanchen Aug 22, 2024
94f2d34
Update _utils.py
danielhanchen Aug 22, 2024
7c5222d
Update _utils.py
danielhanchen Aug 22, 2024
15d4417
Update _utils.py
danielhanchen Aug 22, 2024
1ea463c
Update _utils.py
danielhanchen Aug 22, 2024
cf929e2
Update tokenizer_utils.py
danielhanchen Aug 22, 2024
5a7be98
Update tokenizer_utils.py
danielhanchen Aug 22, 2024
2590b4c
Update tokenizer_utils.py
danielhanchen Aug 22, 2024
621e65b
update token retrieval logic (#952)
not-lain Aug 23, 2024
b62e5cd
Update llama.py
danielhanchen Aug 23, 2024
fb9dd65
Merge branch 'nightly' of https://github.com/unslothai/unsloth into n…
danielhanchen Aug 23, 2024
3b49609
get_token
danielhanchen Aug 24, 2024
9c8875e
Update README.md
danielhanchen Aug 24, 2024
c25de14
Merge branch 'main' into nightly
danielhanchen Aug 25, 2024
646a27b
Merge branch 'main' into nightly
danielhanchen Aug 27, 2024
a44357d
Update gemma2.py
danielhanchen Aug 30, 2024
7ed1c16
Update rms_layernorm.py
danielhanchen Aug 30, 2024
d7ef49e
synchronize
danielhanchen Aug 30, 2024
9a69548
Update gemma2.py
danielhanchen Aug 30, 2024
e6dadb4
Update rms_layernorm.py
danielhanchen Aug 30, 2024
f8e77cf
Update rms_layernorm.py
danielhanchen Aug 30, 2024
cfbaa97
Update rms_layernorm.py
danielhanchen Aug 30, 2024
32b2f3f
layernorm
danielhanchen Aug 30, 2024
9e7057d
Update rms_layernorm.py
danielhanchen Aug 30, 2024
a193508
Update gemma2.py
danielhanchen Aug 30, 2024
65eaa2d
Update rms_layernorm.py
danielhanchen Aug 30, 2024
1beeb22
Update rms_layernorm.py
danielhanchen Aug 30, 2024
1eb7705
revert
danielhanchen Aug 30, 2024
c3fe972
Gemma
danielhanchen Aug 31, 2024
75dbfba
Update rms_layernorm.py
danielhanchen Aug 31, 2024
332b091
Update rms_layernorm.py
danielhanchen Aug 31, 2024
4ecc119
Update rms_layernorm.py
danielhanchen Aug 31, 2024
07a1246
Update rms_layernorm.py
danielhanchen Aug 31, 2024
e3239e4
Update rms_layernorm.py
danielhanchen Aug 31, 2024
6ae1ac2
Update rms_layernorm.py
danielhanchen Aug 31, 2024
4d89f27
Update rms_layernorm.py
danielhanchen Aug 31, 2024
c76be22
Update rms_layernorm.py
danielhanchen Aug 31, 2024
ace509c
Update rms_layernorm.py
danielhanchen Aug 31, 2024
e474cfe
Update rms_layernorm.py
danielhanchen Aug 31, 2024
1576a1e
Update rms_layernorm.py
danielhanchen Aug 31, 2024
a2c4691
Update rms_layernorm.py
danielhanchen Aug 31, 2024
1a02e75
Update rms_layernorm.py
danielhanchen Aug 31, 2024
a26e1d1
Update rms_layernorm.py
danielhanchen Aug 31, 2024
afdb443
Update rms_layernorm.py
danielhanchen Sep 1, 2024
c3e14d8
Update rms_layernorm.py
danielhanchen Sep 1, 2024
1830bdd
Update rms_layernorm.py
danielhanchen Sep 1, 2024
6abf66a
Update rms_layernorm.py
danielhanchen Sep 1, 2024
f5cf796
Update rms_layernorm.py
danielhanchen Sep 1, 2024
b191530
Update rms_layernorm.py
danielhanchen Sep 1, 2024
512c61f
Update rms_layernorm.py
danielhanchen Sep 1, 2024
f5d50ef
Update rms_layernorm.py
danielhanchen Sep 1, 2024
d791bb9
Update rms_layernorm.py
danielhanchen Sep 1, 2024
9225608
Update gemma2.py
danielhanchen Sep 1, 2024
f61869c
Change UnslothTrainingArguments base class to SFTConfig (#979)
vTuanpham Sep 2, 2024
73d49ad
Cohere
danielhanchen Sep 2, 2024
86b6236
Merge branch 'nightly' of https://github.com/unslothai/unsloth into n…
danielhanchen Sep 2, 2024
edef5ca
Update trainer.py
danielhanchen Sep 2, 2024
6d4300c
Cohere
danielhanchen Sep 2, 2024
754e670
Cohere
danielhanchen Sep 2, 2024
d242866
New models
danielhanchen Sep 3, 2024
0b7e973
Update llama.py
danielhanchen Sep 3, 2024
19549f2
Update llama.py
danielhanchen Sep 3, 2024
8823e13
Update cohere.py
danielhanchen Sep 3, 2024
90050b7
Update llama.py
danielhanchen Sep 3, 2024
4c1ec3a
Update cohere.py
danielhanchen Sep 3, 2024
97b3956
retry
danielhanchen Sep 3, 2024
fd615ea
Update fast_lora.py
danielhanchen Sep 3, 2024
fe45990
Update llama.py
danielhanchen Sep 3, 2024
f564b8a
Update fast_lora.py
danielhanchen Sep 3, 2024
b26da84
Update llama.py
danielhanchen Sep 3, 2024
61be6a3
Update llama.py
danielhanchen Sep 3, 2024
ea48761
Update cross_entropy_loss.py
danielhanchen Sep 3, 2024
6e795c6
_apply_lora_mlp
danielhanchen Sep 3, 2024
dacba39
Update _utils.py
danielhanchen Sep 3, 2024
5074427
Gemma fixes
danielhanchen Sep 3, 2024
743ba55
Update llama.py
danielhanchen Sep 3, 2024
315136a
Merge branch 'main' into nightly
danielhanchen Sep 3, 2024
7ea6395
Update flex_attention.py
danielhanchen Sep 3, 2024
91d6773
Merge branch 'main' into nightly
danielhanchen Sep 4, 2024
df06a04
Update llama.py
danielhanchen Sep 4, 2024
7f139f1
layernorm
danielhanchen Sep 4, 2024
068fc0d
Update llama.py
danielhanchen Sep 4, 2024
4eaccb0
Update llama.py
danielhanchen Sep 4, 2024
4f909fc
Flex Attention
danielhanchen Sep 5, 2024
efef0ee
Update gemma2.py
danielhanchen Sep 5, 2024
6e8951f
Update __init__.py
danielhanchen Sep 5, 2024
d60a18c
Update flex_attention.py
danielhanchen Sep 5, 2024
1b4132e
Update flex_attention.py
danielhanchen Sep 5, 2024
f5d11dc
Update flex_attention.py
danielhanchen Sep 5, 2024
2454659
Update flex_attention.py
danielhanchen Sep 5, 2024
984d217
Update flex_attention.py
danielhanchen Sep 5, 2024
e3846f5
Update flex_attention.py
danielhanchen Sep 5, 2024
2d29299
Update flex_attention.py
danielhanchen Sep 5, 2024
03310b9
Update flex_attention.py
danielhanchen Sep 5, 2024
eb37676
Update flex_attention.py
danielhanchen Sep 5, 2024
cb6a835
Update flex_attention.py
danielhanchen Sep 5, 2024
cbd6a6a
Update flex_attention.py
danielhanchen Sep 5, 2024
712deaa
Update flex_attention.py
danielhanchen Sep 5, 2024
6e74563
Update flex_attention.py
danielhanchen Sep 5, 2024
0703ce8
Update flex_attention.py
danielhanchen Sep 5, 2024
e2cafc4
Update flex_attention.py
danielhanchen Sep 5, 2024
25fb059
Update flex_attention.py
danielhanchen Sep 5, 2024
6ddcd60
Update flex_attention.py
danielhanchen Sep 6, 2024
a806b20
Update chat_templates.py (#999)
AgainstEntropy Sep 7, 2024
a690e5e
Update key from "from" to "user" (#1000)
wa008 Sep 7, 2024
6693712
Update chat_templates.py
danielhanchen Sep 7, 2024
fabda63
Also patch the KTO trainer (#1001)
corbt Sep 7, 2024
f9b8a73
flex attention
danielhanchen Sep 7, 2024
2fa9979
Update llama.py
danielhanchen Sep 7, 2024
86017d3
Update flex_attention.py
danielhanchen Sep 7, 2024
130c739
Update flex_attention.py
danielhanchen Sep 7, 2024
528c673
Update _utils.py
danielhanchen Sep 8, 2024
7380ac5
Update _utils.py
danielhanchen Sep 8, 2024
4e1a50c
Update flex_attention.py
danielhanchen Sep 8, 2024
6e9d3de
Update gemma2.py
danielhanchen Sep 8, 2024
879fc88
Update gemma2.py
danielhanchen Sep 8, 2024
16 changes: 8 additions & 8 deletions unsloth/chat_templates.py
@@ -1033,7 +1033,7 @@ def to_sharegpt(
merged_prompt = "",
merged_column_name = "instruction",
output_column_name = "output",
remove_unsued_columns = True,
remove_unused_columns = True,
conversation_extension = 1,
random_state = 3407,
):
@@ -1047,7 +1047,7 @@ def to_sharegpt(
merged_prompt = "", Prompt to merge columns into 1 input
merged_column_name = "instruction", Final column name for the input field
output_column_name = "output", Final column name for the output field
remove_unsued_columns = True,
remove_unused_columns = True,
conversation_extension = 1, Automatically combines `conversation_extension` convos into 1
random_state = 3407,
"""
@@ -1068,8 +1068,8 @@ def __convert_to_sharegpt__(examples):
assistants = examples[output_column_name]
texts = [
[
{"from" : "user", "content" : str(user) },
{"from" : "assistant", "content" : str(assistant)},
{"from" : "human", "value" : str(user) },
{"from" : "gpt", "value" : str(assistant)},
] \
for user, assistant in zip(users, assistants)
]
@@ -1080,8 +1080,8 @@ def __convert_to_sharegpt__(examples):
__convert_to_sharegpt__,
batched = True,
desc = "Converting to ShareGPT",
# Remove unsued columns!
remove_columns = dataset.column_names if remove_unsued_columns else None,
# Remove unused columns!
remove_columns = dataset.column_names if remove_unused_columns else None,
)

# Randomnly concat conversations to create a long stream!
@@ -1115,8 +1115,8 @@ def __convert_to_sharegpt__(examples):
__combine_conversations__,
batched = True,
desc = "Extending conversations",
# Remove unsued columns!
remove_columns = dataset.column_names if remove_unsued_columns else None,
# Remove unused columns!
remove_columns = dataset.column_names if remove_unused_columns else None,
)
return dataset
pass
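For reference, this hunk does two things: it renames the misspelled remove_unsued_columns argument to remove_unused_columns, and it restores the standard ShareGPT keys ("from"/"value" with "human"/"gpt" roles) for the converted records. A minimal usage sketch under those changes follows; the dataset name and prompt template are illustrative assumptions, not part of this PR.

# Illustrative sketch only: the dataset and prompt template are hypothetical.
from datasets import load_dataset
from unsloth.chat_templates import to_sharegpt

dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = to_sharegpt(
    dataset,
    merged_prompt = "{instruction}\n{input}",
    output_column_name = "output",
    remove_unused_columns = True,   # renamed from the old remove_unsued_columns
    conversation_extension = 3,     # randomly merges 3 single-turn convos into one
)
# Each converted example now holds ShareGPT-style turns:
# [{"from": "human", "value": "..."}, {"from": "gpt", "value": "..."}]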
8 changes: 2 additions & 6 deletions unsloth/kernels/__init__.py
@@ -37,14 +37,10 @@
HAS_FLEX_ATTENTION,
slow_attention_softcapping,
slow_inference_attention_softcapping,
create_flex_attention_causal_mask,
create_flex_attention_sliding_window_mask,
)

if HAS_FLEX_ATTENTION:
from .flex_attention import (
FLEX_ATTENTION_PADDING,
)
pass

try:
print("🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.")
except:
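With this change the mask helpers are re-exported unconditionally from kernels.flex_attention (and the FLEX_ATTENTION_PADDING re-export is dropped), so callers only branch on HAS_FLEX_ATTENTION. A small hedged sketch of the guarded use, assuming the names defined in the flex_attention.py diff below:

from unsloth.kernels.flex_attention import (
    HAS_FLEX_ATTENTION,
    create_flex_attention_causal_mask,
)

if HAS_FLEX_ATTENTION:
    causal_mask = create_flex_attention_causal_mask(max_seq_length = 8192)
else:
    causal_mask = None  # fall back to the dense-mask path used elsewhere in this PR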
157 changes: 109 additions & 48 deletions unsloth/kernels/flex_attention.py
@@ -25,59 +25,120 @@
}

# Flex Attention supported from torch 2.5 onwards only
import torch.nn
if hasattr(torch.nn, "attention"):
import torch.nn.attention
if hasattr(torch.nn.attention, "flex_attention"):
import torch.nn.attention.flex_attention
from torch.nn.attention.flex_attention import flex_attention
from torch.nn.attention.flex_attention import create_block_mask
FLEX_ATTENTION_PADDING = getattr(
torch.nn.attention.flex_attention,
"_DEFAULT_SPARSE_BLOCK_SIZE",
1,
)
flex_attention = torch.compile(flex_attention, dynamic = False)
HAS_FLEX_ATTENTION = True
else:
HAS_FLEX_ATTENTION = False
pass
else:
try:
from torch.nn.attention.flex_attention import (
flex_attention as _flex_attention,
create_block_mask as _create_block_mask,
)
_flex_attention = torch.compile(_flex_attention, dynamic = True, options = torch_compile_options)
HAS_FLEX_ATTENTION = True
except:
HAS_FLEX_ATTENTION = False
pass

# Logit softcapping
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
n_heads = self.num_heads
head_dim = self.head_dim
n_kv_heads = self.num_key_value_heads
n_groups = self.num_key_value_groups

# Grouped query attention
K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
K = K.reshape(bsz, n_heads, q_len, head_dim)
V = V.reshape(bsz, n_heads, q_len, head_dim)

# See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
# Gemma 9b should use 256 and not 224 (hs / nah). 27b uses the below
# We default to using the config file itself
# s = self.config.hidden_size // self.config.num_attention_heads
s = self.config.query_pre_attn_scalar
t = self.config.attn_logit_softcapping
if not HAS_FLEX_ATTENTION:

Q = Q * torch.tensor(s**-0.5, dtype = Q.dtype) # Follow Keras exactly
A = torch.matmul(Q, K.transpose(2, 3))
A = t * torch.tanh(A / t) # Logit softcapping
A += causal_mask[:q_len, :q_len]
# Much slower in torch compile!
# A.masked_fill_(causal_mask[:q_len, :q_len], -float("inf"))
A = torch.nn.functional.softmax(A, dim = -1, dtype = torch.float32).to(Q.dtype)
A = torch.matmul(A, V)
A = A.transpose(1, 2).contiguous()
A = A.reshape(bsz, q_len, n_heads*head_dim)
return A
# Logit softcapping
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
n_heads = self.num_heads
head_dim = self.head_dim
n_kv_heads = self.num_key_value_heads
n_groups = self.num_key_value_groups

# Grouped query attention
K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
K = K.reshape(bsz, n_heads, q_len, head_dim)
V = V.reshape(bsz, n_heads, q_len, head_dim)

# See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
# Gemma 9b should use 256 and not 224 (hs / nah). 27b uses the below
# We default to using the config file itself
# s = self.config.hidden_size // self.config.num_attention_heads
s = self.config.query_pre_attn_scalar
t = self.config.attn_logit_softcapping

Q = Q * torch.tensor(s**-0.5, dtype = Q.dtype) # Follow Keras exactly
A = torch.matmul(Q, K.transpose(2, 3))
A = t * torch.tanh(A / t) # Logit softcapping
A += causal_mask[:q_len, :q_len]
# Much slower in torch compile!
# A.masked_fill_(causal_mask[:q_len, :q_len], -float("inf"))
A = torch.nn.functional.softmax(A, dim = -1, dtype = torch.float32).to(Q.dtype)
A = torch.matmul(A, V)
A = A.transpose(1, 2).contiguous()
A = A.reshape(bsz, q_len, n_heads*head_dim)
return A
pass

create_flex_attention_causal_mask = None
create_flex_attention_sliding_window_mask = None
else:
# See https://github.com/pytorch-labs/attention-gym/blob/main/examples/flex_attn.ipynb
# for more examples
# BSD 3-Clause License Copyright (c) 2023, Driss Guessous, Horace He et al
import functools, math

def generate_tanh_softcap(t):
def tanh_softcap(x, b, h, q_idx, kv_idx):
return t * torch.tanh(x / t)
return tanh_softcap
pass
def causal_masker(b, h, q_idx, kv_idx):
return q_idx >= kv_idx
pass

@functools.lru_cache
def sliding_window_masker(size = 4096):
def sliding_window(b, h, q_idx, kv_idx):
causal_mask = q_idx >= kv_idx
window_mask = q_idx - kv_idx <= size
return causal_mask & window_mask
return sliding_window
pass

@functools.lru_cache
def create_block_mask(mask, n = 128):
return _create_block_mask(
mask, 1, 1, n, n,
BLOCK_SIZE = 128,
_compile = True,
)
pass

def create_flex_attention_causal_mask(max_seq_length = 8192):
causal_mask = create_block_mask(causal_masker, max_seq_length)
return causal_mask
pass

def create_flex_attention_sliding_window_mask(max_seq_length = 8192, sliding_window = 4096):
sliding_masker = sliding_window_masker(sliding_window)
causal_mask = create_block_mask(sliding_masker, max_seq_length)
return causal_mask
pass

@functools.lru_cache
def flex_attention(s, t):
scale = 1.0 / math.sqrt(s)
score_mod = generate_tanh_softcap(t)
return functools.partial(
_flex_attention, score_mod = score_mod, scale = scale, enable_gqa = True,
)
pass

def slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
n_heads = self.num_heads
head_dim = self.head_dim
s = self.config.query_pre_attn_scalar
t = self.config.attn_logit_softcapping
fx = flex_attention(s, t)
A = fx(query = Q, key = K, value = V, block_mask = causal_mask)
A = A.transpose(1, 2).contiguous()
A = A.reshape(bsz, q_len, n_heads*head_dim)
return A
pass
pass


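The rewrite above compiles torch.nn.attention.flex_attention once, caches block masks built with create_block_mask, and expresses Gemma-2's tanh logit softcapping as a score_mod. For orientation, here is a standalone sketch of the same pattern against the public torch >= 2.5 API; the shapes, softcap value, and window size are illustrative assumptions rather than Unsloth's settings.

# Requires torch >= 2.5 and a CUDA device; all values below are illustrative.
import torch
from torch.nn.attention.flex_attention import flex_attention, create_block_mask

def tanh_softcap(score, b, h, q_idx, kv_idx, t = 50.0):
    # Gemma-2 style logit softcapping applied to every attention score
    return t * torch.tanh(score / t)

def sliding_window_causal(b, h, q_idx, kv_idx, window = 4096):
    # Causal mask restricted to a local window, as in Gemma-2's SWA layers
    return (q_idx >= kv_idx) & (q_idx - kv_idx <= window)

B, H, S, D = 1, 8, 8192, 64
q = torch.randn(B, H, S, D, device = "cuda", dtype = torch.bfloat16)
k = torch.randn(B, H, S, D, device = "cuda", dtype = torch.bfloat16)
v = torch.randn(B, H, S, D, device = "cuda", dtype = torch.bfloat16)

block_mask = create_block_mask(sliding_window_causal, B = None, H = None, Q_LEN = S, KV_LEN = S)
out = flex_attention(q, k, v, score_mod = tanh_softcap, block_mask = block_mask)

The production code additionally wraps flex_attention in torch.compile and lru_caches the mask builders and partials, which is what the bumped dynamo cache limits in _utils.py below accommodate.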
5 changes: 3 additions & 2 deletions unsloth/models/_utils.py
@@ -330,17 +330,18 @@ def is_big_gpu(index):
"config.coordinate_descent_tuning = True",
"config.max_autotune_gemm = False", # GEMM is unnecessary
"config.autotune_multi_device = False",
"config.max_autotune_gemm_backends = 'ATEN'", # Not much faster
"config.max_autotune_gemm_backends = 'TRITON,ATEN,CPP'", # Not much faster
"config.aggressive_fusion = False", # Careful changes results!
"config.cuda.enable_cuda_lto = True",
"config.cuda.use_fast_math = True",
"config.cuda.compile_opt_level = '-O2'",
]
# Torch dynamo arguments
torch_dynamo_arguments = [
"config.accumulated_cache_size_limit = 512", # Bump up a bit from 256
"config.accumulated_cache_size_limit = 1024", # Bump up a bit from 256
"config.suppress_errors = True", # Supress errors for now
"config.do_not_emit_runtime_asserts = True",
"config.cache_size_limit = 1024", # Flex Attention
]
import torch._inductor.config as config
for _try_compile_argument in torch_compile_arguments:
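The visible part of this hunk widens the inductor autotune backends and raises torch._dynamo's cache limits (the new cache_size_limit entry keeps Flex Attention's many compiled graph variants from being evicted). The loop that applies these strings is truncated above; a hedged reconstruction of that pattern, with the lists shortened for illustration:

import torch._inductor.config as config

torch_compile_arguments = [
    "config.max_autotune_gemm_backends = 'TRITON,ATEN,CPP'",
    "config.cuda.use_fast_math = True",
]
for _try_compile_argument in torch_compile_arguments:
    try:
        exec(_try_compile_argument)  # apply the inductor option
    except Exception:
        pass  # skip options missing on older torch builds

import torch._dynamo.config as config  # rebind `config` for the dynamo options

torch_dynamo_arguments = [
    "config.accumulated_cache_size_limit = 1024",
    "config.cache_size_limit = 1024",  # Flex Attention
]
for _try_dynamo_argument in torch_dynamo_arguments:
    try:
        exec(_try_dynamo_argument)
    except Exception:
        pass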
1 change: 0 additions & 1 deletion unsloth/models/gemma2.py
@@ -156,7 +156,6 @@ def Gemma2Attention_fast_forward(
)
A = A.reshape(bsz, q_len, n_heads*head_dim)
else:
mask = causal_mask if attention_mask is None else attention_mask
fx = slow_inference_attention_softcapping \
if "_flag_for_generation" in kwargs else \
slow_attention_softcapping
48 changes: 24 additions & 24 deletions unsloth/models/llama.py
@@ -711,12 +711,6 @@ def LlamaModel_fast_forward(
offloaded_gradient_checkpointing = True
pass

# Check for Flex Attention
# if IS_GEMMA2 and HAS_FLEX_ATTENTION:
# if not (seq_length % FLEX_ATTENTION_PADDING == 0):
# USE_FLEX_ATTENTION = True


# Gemma2 has alternating SWA and global attn
if IS_GEMMA2:
if HAS_FLASH_ATTENTION_SOFTCAPPING and attention_mask is None:
@@ -738,23 +732,29 @@
sliding_window = None,
)
elif not hasattr(self, "SWA_mask"):
n = self.max_seq_length # self.config.max_position_embeddings
# masked_fill is making stuff slower!
# self. GA_mask = create_boolean_mask(n = n, sliding_window = 0)
# self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window)
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
self.SWA_mask = AttentionMaskConverter(
is_causal = True,
sliding_window = self.config.sliding_window,
)\
.to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\
.squeeze(0).squeeze(0)

self.GA_mask = AttentionMaskConverter(
is_causal = True,
)\
.to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\
.squeeze(0).squeeze(0)
if HAS_FLEX_ATTENTION:
# Use Flex Attention instead!
self.SWA_mask = create_flex_attention_sliding_window_mask(self.max_seq_length, self.config.sliding_window)
self.GA_mask = create_flex_attention_causal_mask(self.max_seq_length)
else:
n = self.max_seq_length # self.config.max_position_embeddings
# masked_fill is making stuff slower!
# self. GA_mask = create_boolean_mask(n = n, sliding_window = 0)
# self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window)
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
self.SWA_mask = AttentionMaskConverter(
is_causal = True,
sliding_window = self.config.sliding_window,
)\
.to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\
.squeeze(0).squeeze(0)

self.GA_mask = AttentionMaskConverter(
is_causal = True,
)\
.to_causal_4d(1, n, n, dtype = inputs_embeds.dtype, device = "cuda:0",)\
.squeeze(0).squeeze(0)
pass
pass
pass

@@ -821,7 +821,7 @@ def custom_forward(*inputs):
(fast_rms_layernorm_inference_gemma if IS_GEMMA else fast_rms_layernorm_inference)\
(self.norm, hidden_states)
elif IS_COHERE:
hidden_states = fast_layernorm_compiled(self.norm, hidden_states)
hidden_states = self.norm(hidden_states)
else:
hidden_states = fast_rms_layernorm(self.norm, hidden_states, gemma = IS_GEMMA)
pass
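Two things change here: for Gemma-2, the cached sliding-window (SWA_mask) and global (GA_mask) masks become Flex Attention block masks when available, falling back to the dense 4-D masks from transformers' AttentionMaskConverter otherwise; and Cohere's final norm is now called directly instead of through fast_layernorm_compiled. A hedged sketch of the two mask-construction paths, with names mirroring the diff and everything else illustrative:

import torch
from transformers.modeling_attn_mask_utils import AttentionMaskConverter

max_seq_length, sliding_window = 8192, 4096
HAS_FLEX_ATTENTION = False  # assume the dense fallback for this sketch

if HAS_FLEX_ATTENTION:
    # Tiny sparse block-mask descriptors, reused across layers
    from unsloth.kernels.flex_attention import (
        create_flex_attention_sliding_window_mask,
        create_flex_attention_causal_mask,
    )
    SWA_mask = create_flex_attention_sliding_window_mask(max_seq_length, sliding_window)
    GA_mask = create_flex_attention_causal_mask(max_seq_length)
else:
    # Dense (n, n) additive masks built once with transformers' converter
    n = max_seq_length
    SWA_mask = AttentionMaskConverter(is_causal = True, sliding_window = sliding_window)\
        .to_causal_4d(1, n, n, dtype = torch.float16, device = "cuda:0")\
        .squeeze(0).squeeze(0)
    GA_mask = AttentionMaskConverter(is_causal = True)\
        .to_causal_4d(1, n, n, dtype = torch.float16, device = "cuda:0")\
        .squeeze(0).squeeze(0)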
2 changes: 1 addition & 1 deletion unsloth/tokenizer_utils.py
@@ -1144,7 +1144,7 @@ def patch_sft_trainer_tokenizer():

# Patch train with fix_untrained_tokens
for path_to_trainer in \
("sft_trainer.SFTTrainer", "dpo_trainer.DPOTrainer",):
("sft_trainer.SFTTrainer", "dpo_trainer.DPOTrainer", "kto_trainer.KTOTrainer"):

function_name, replacer = "train", "if resume_from_checkpoint is False:"
function = getsource(eval(f"trl.trainer.{path_to_trainer}.{function_name}"))
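The last hunk registers kto_trainer.KTOTrainer next to the SFT and DPO trainers in the patching loop, matching the "Also patch the KTO trainer" commit. The loop itself, truncated above, pulls each trainer's train source with inspect.getsource, splices the untrained-token fix in ahead of the resume_from_checkpoint check, and re-execs it. Below is a toy, hedged illustration of that source-patching pattern; the Toy class and injected print are made up for demonstration.

# Run as a .py file so inspect can locate the source of Toy.train.
from inspect import getsource
import textwrap

class Toy:
    def train(self, resume_from_checkpoint = False):
        if resume_from_checkpoint is False:
            print("fresh run")

marker = "if resume_from_checkpoint is False:"
source = textwrap.dedent(getsource(Toy.train))
patched = source.replace(marker, "print('fix_untrained_tokens would run here')\n    " + marker)

namespace = {}
exec(patched, namespace)         # recompile the edited function
Toy.train = namespace["train"]   # and swap it back onto the class

Toy().train()  # prints the injected line, then "fresh run"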