
Commit 93d50d1

Moved input ID chunking to model, added logic to limit size of attention for very long contexts

turboderp committed Jun 28, 2023
1 parent a01b25c commit 93d50d1
Showing 2 changed files with 78 additions and 23 deletions.
28 changes: 9 additions & 19 deletions generator.py
@@ -4,8 +4,6 @@
 import torch
 import torch.nn.functional as F
 
-DEFAULT_MAX_CHUNK = 2048
-
 class ExLlamaGenerator:
 
     class Settings:
@@ -162,20 +160,15 @@ def disallow_tokens(self, tokens):
         self.disallowed_tokens = tokens
 
 
-    def gen_begin(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK):
+    def gen_begin(self, in_tokens):
 
         self.end_beam_search()
 
         self.sequence = in_tokens.clone()
         self.sequence_actual = in_tokens.clone()
         self.cache.current_seq_len = 0
 
         if in_tokens.shape[-1] > 1:
-            a = 0
-            while a < self.sequence.shape[-1] - 1:
-                b = min(a + max_chunk, self.sequence.shape[-1] - 1)
-                self.model.forward(self.sequence[:, a:b], self.cache, preprocess_only = True, lora = self.lora)
-                a = b
+            self.model.forward(self.sequence[:, :-1], self.cache, preprocess_only = True, lora = self.lora)
 
 
     def gen_begin_empty(self):
@@ -186,11 +179,11 @@ def gen_begin_empty(self):
         self.cache.current_seq_len = 0
 
 
-    def gen_begin_reuse(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK):
+    def gen_begin_reuse(self, in_tokens):
 
         self.end_beam_search()
         if self.sequence is None or self.cache.current_seq_len == 0:
-            self.gen_begin(in_tokens, max_chunk)
+            self.gen_begin(in_tokens)
             return 0
 
         # if in_tokens.shape[-1] < self.sequence.shape[-1]:
@@ -201,7 +194,7 @@ def gen_begin_reuse(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK):
             reuse += 1
 
         if reuse < 2:
-            self.gen_begin(in_tokens, max_chunk)
+            self.gen_begin(in_tokens)
             return 0
 
         # print (f"Reusing cache: {reuse} tokens")
@@ -214,10 +207,10 @@ def gen_begin_reuse(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK):
         return reuse
 
 
-    def gen_feed_tokens(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK):
+    def gen_feed_tokens(self, in_tokens):
 
         if self.sequence is None:
-            self.gen_begin(in_tokens, max_chunk)
+            self.gen_begin(in_tokens)
             return
 
         self.end_beam_search()
@@ -229,11 +222,8 @@ def gen_feed_tokens(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK):
         else:
             self.sequence = torch.cat((self.sequence, in_tokens), dim = 1)
 
-        a = start
-        while a < self.sequence.shape[-1] - 1:
-            b = min(a + max_chunk, self.sequence.shape[-1] - 1)
-            self.model.forward(self.sequence[:, a:b], self.cache, preprocess_only = True, lora = self.lora)
-            a = b
+        if start < self.sequence.shape[-1] - 1:
+            self.model.forward(self.sequence[:, start : -1], self.cache, preprocess_only = True, lora = self.lora)
 
         self.sequence_actual = self.sequence

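For context, a rough usage sketch of the generator after this change. The setup boilerplate below follows the repo's example scripts and is assumed, not part of this commit, and the paths are placeholders. The point is that callers no longer pass a max_chunk argument; long prompts are now split inside ExLlama.forward() according to config.max_input_len and config.max_attention_size.

# Hypothetical usage sketch; setup and paths follow the repo's example scripts
from model import ExLlama, ExLlamaCache, ExLlamaConfig
from tokenizer import ExLlamaTokenizer
from generator import ExLlamaGenerator

config = ExLlamaConfig("models/llama-13b-4bit/config.json")      # placeholder path
config.model_path = "models/llama-13b-4bit/model.safetensors"    # placeholder path
config.max_input_len = 2048           # per-chunk cap on input IDs (new setting)
config.max_attention_size = 2048**2   # cap on (past_len + chunk_len) * chunk_len (new setting)

model = ExLlama(config)
tokenizer = ExLlamaTokenizer("models/llama-13b-4bit/tokenizer.model")  # placeholder path
cache = ExLlamaCache(model)
generator = ExLlamaGenerator(model, tokenizer, cache)

ids = tokenizer.encode("some very long prompt ...")
generator.gen_begin(ids)              # no max_chunk argument; chunking now happens in ExLlama.forward()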
73 changes: 69 additions & 4 deletions model.py
@@ -68,9 +68,11 @@ def __init__(self, model_config_path):
 
         # Optional settings
 
-        self.max_seq_len = 2048                     # Reduce to save memory. Can also be increased, but the pretrained models produce degenerate output after 2048 tokens in any case. Should be possible to finetune for longer sequence lengths.
+        self.max_seq_len = 2048                     # Reduce to save memory. Can also be increased, ideally while also using compress_pos_emb and a compatible model/LoRA
+        self.max_input_len = 2048                   # Maximum length of input IDs in a single forward pass. Sequences longer than this will be processed in multiple steps
+        self.max_attention_size = 2048**2           # Sequences will be processed in chunks to keep the size of the attention weights matrix <= this
         self.compress_pos_emb = 1.0                 # Increase to compress positional embeddings applied to sequence
-        self.gpu_peer_fix = False                   # Apparently Torch can have problems transferring tensors directly from one GPU to another sometimes. Enable this to move tensors via system RAM instead, where needed
+        self.gpu_peer_fix = False                   # Apparently Torch can have problems transferring tensors directly from one GPU to another sometimes. Enable this to explicitly move tensors via system RAM instead, where needed

aljungberg (Contributor) commented on Jun 28, 2023:

You might be interested to know that this could be a driver issue. If you're using 2x RTX 4090, there has been a driver bug on Linux causing corrupt results. It seems that although NVIDIA doesn't want their latest-generation consumer cards to be able to do peer-to-peer transfers, they forgot to fully disable it in the driver. So the data transfer would appear to succeed, but you'd be reading uninitialised memory on the next card. Driver 525.105.17 and above should fix this.

turboderp (Author, Owner) commented on Jun 28, 2023:

Well, I never found out what it was, and I'm not sure it's this bug. Someone was having an issue (#10) on 530.41.03 which would cause tensor.to() to create empty tensors on the target device instead of copying any data across. It may be related to IOMMU settings somehow, but I'm not sure.

I can't get peer access working between my 4090 and 3090Ti, sadly, but Torch seems to just implicitly move tensors via system RAM as needed. I guess some combination of settings can cause it to assume peer access is supported when it isn't.

jmoney7823956789378 commented on Jun 28, 2023:

Just to confirm: the intended scenario for multi-GPU is pure GPU <-> GPU tensor transfers, but certain setups have issues with this method and fall back to GPU <-> RAM <-> GPU?
If so, how would you confirm which method is being used for inference in any given hardware environment?

turboderp (Author, Owner) commented on Jun 28, 2023:

I don't actually know if Torch exposes this somehow. But you could just try a simple script that copies a large tensor back and forth a few times and check whether tensor.to("cuda:1").to("cuda:0") is any faster than tensor.to("cpu").to("cuda:1").to("cpu").to("cuda:0").

For ExLlama it doesn't matter too much, since it's only the hidden state that moves from device to device, and only once, where the model is split. In a single-token forward pass that tensor is some kilobytes in size, so the bandwidth requirement is negligible.

Ideally you'd want to use two or more devices in parallel, and I already have most of the code written up for that, but it's much too slow without peer access, requiring synchronization before and after every matmul.

jmoney7823956789378 commented on Jun 28, 2023:

> But you could just try a simple script that copies a large tensor back and forth a few times and check whether tensor.to("cuda:1").to("cuda:0") is any faster than tensor.to("cpu").to("cuda:1").to("cpu").to("cuda:0").

Unfortunately, I'm only a Python reader, not a writer.
I feel bad asking, but I'm sure you've written this script once or twice before... would you be able to drop it in a discussion thread for testing?

aljungberg (Contributor) commented on Jun 28, 2023:

> Well, I never found out what it was, and I'm not sure it's this bug. Someone was having an issue (#10) on 530.41.03 which would cause tensor.to() to create empty tensors on the target device instead of copying any data across. It may be related to IOMMU settings somehow, but I'm not sure.

An empty tensor is an uninitialised tensor, so that sounds exactly like the problem. 530.41.03 is one of the affected drivers: "The current 530.xx driver - 530.41.03 does not include the [fix]."

And I see the person in #10 is indeed using dual RTX 4090 cards, so it's exactly the recipe for triggering this peer-to-peer bug. This was breaking all my fine-tuning on multi-GPU with qlora for a while.

> I can't get peer access working between my 4090 and 3090Ti, sadly, but Torch seems to just implicitly move tensors via system RAM as needed. I guess some combination of settings can cause it to assume peer access is supported when it isn't.

The driver is supposed to report whether peer access works or not, and in this case it reports that it does while in reality it doesn't. PyTorch's only fault is believing the driver, I think. It shouldn't work with your setup since it includes a 4090 and, it seems, NVIDIA is actively blocking this feature on 4090s. If you tried it with two 3090s, I think it would work.

turboderp (Author, Owner) commented on Jun 28, 2023:

@jmoney7823956789378 Stuck a little script in #113.
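For anyone following along, here is a rough sketch of the kind of check described above. It is not the script posted in #113, just an illustration of the approach, assuming two CUDA devices; torch.cuda.can_device_access_peer() only reports what the driver claims about peer access.

# Rough sketch, not the actual script from #113: compare direct GPU<->GPU copies
# against copies routed through system RAM, and print what the driver reports.
import time
import torch

def timed(fn, n = 20):
    # Time n round trips, synchronizing both devices before and after
    torch.cuda.synchronize("cuda:0")
    torch.cuda.synchronize("cuda:1")
    t = time.time()
    for _ in range(n): fn()
    torch.cuda.synchronize("cuda:0")
    torch.cuda.synchronize("cuda:1")
    return time.time() - t

x = torch.randn((8192, 8192), dtype = torch.float16, device = "cuda:0")

print("Driver reports peer access 0 -> 1:", torch.cuda.can_device_access_peer(0, 1))

direct = timed(lambda: x.to("cuda:1").to("cuda:0"))
via_ram = timed(lambda: x.to("cpu").to("cuda:1").to("cpu").to("cuda:0"))

print(f"direct GPU<->GPU: {direct:.3f} s   via system RAM: {via_ram:.3f} s")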

         self.auto_map = None                       # List of floats with memory allocation in GB, per CUDA device, overrides device_map
 
         # Tuning
@@ -779,7 +781,7 @@ def __init__(self, config):
             device_buffers = {}
             self.buffers.append(device_buffers)
 
-            temp_state = torch.zeros((config.max_seq_len, config.intermediate_size), dtype = torch.float16, device = dev)
+            temp_state = torch.zeros((config.max_input_len, config.intermediate_size), dtype = torch.float16, device = dev)
             temp_mlp = torch.zeros((config.fused_mlp_thd * 2, config.intermediate_size), dtype = torch.float16, device = dev)
             temp_zeros_float = torch.zeros((1, 65536), dtype = torch.float32, device = dev)
             temp_dq = torch.zeros((1, max_dq_buffer_size), dtype = torch.float16, device = dev)
@@ -800,7 +802,70 @@ def __init__(self, config):
         torch.cuda.empty_cache()
 
 
-    def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False, lora = None, output_device = None, input_mask = None):
+    def forward(self,
+                input_ids,
+                cache,
+                last_id_only = True,
+                preprocess_only = False,
+                lora = None,
+                output_device = None,
+                input_mask = None):
+
+        q_len = input_ids.shape[-1]
+        remaining_q_len = q_len
+
+        # Split forward pass
+
+        result = None
+
+        chunk_begin = 0
+        while chunk_begin < q_len:
+
+            # Limit chunk_size to max_input_len
+
+            chunk_size = min(remaining_q_len, self.config.max_input_len)
+
+            # Limit chunk_size to keep size of attention operation <= max_attention_size
+
+            past_len = cache.current_seq_len
+            attn_size = (past_len + remaining_q_len) * remaining_q_len
+            max_a = self.config.max_attention_size
+            if attn_size > max_a:
+                cs = (math.sqrt(past_len ** 2 + 4 * max_a) - past_len) / 2
+                chunk_size = math.floor(cs)
+
+            # Process chunk
+
+            chunk_end = min(chunk_begin + chunk_size, q_len)
+
+            _last_id_only = last_id_only
+            _preprocess_only = preprocess_only or (chunk_end < q_len and last_id_only)
+
+            r = self._forward(input_ids[:, chunk_begin : chunk_end],
+                              cache,
+                              _last_id_only,
+                              _preprocess_only,
+                              lora,
+                              output_device,
+                              input_mask)
+
+            if not _preprocess_only:
+                result = r if result is None else torch.cat((result, r), dim = -1)
+
+            chunk_begin = chunk_end
+            remaining_q_len -= chunk_size
+
+        return result
+
+
+    def _forward(self,
+                 input_ids,
+                 cache,
+                 last_id_only = True,
+                 preprocess_only = False,
+                 lora = None,
+                 output_device = None,
+                 input_mask = None):
 
         assert input_mask is None or input_mask.shape == input_ids.shape

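A note on the attention cap in the new forward() above: when (past_len + remaining_q_len) * remaining_q_len would exceed max_attention_size, the chunk size is capped at the positive root of cs^2 + past_len * cs - max_attention_size = 0, which is the largest cs such that (past_len + cs) * cs <= max_attention_size. A small standalone illustration with made-up numbers:

# Standalone illustration of the chunk-size bound used in ExLlama.forward above
import math

max_attention_size = 2048 ** 2   # default value of the new config setting
past_len = 3000                  # example: 3000 tokens already in the cache

# Positive root of cs^2 + past_len * cs - max_attention_size = 0
cs = (math.sqrt(past_len ** 2 + 4 * max_attention_size) - past_len) / 2
chunk_size = math.floor(cs)

print(chunk_size)                                      # 1038
print((past_len + chunk_size) * chunk_size)            # 4191444 <= 4194304, fits
print((past_len + chunk_size + 1) * (chunk_size + 1))  # 4196521  > 4194304, too big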
