
Commit 93d50d1

Moved input ID chunking to model, added logic to limit size of attention for very long contexts

turboderp committed Jun 28, 2023
1 parent a01b25c commit 93d50d1
Showing 2 changed files with 78 additions and 23 deletions.
28 changes: 9 additions & 19 deletions generator.py
@@ -4,8 +4,6 @@
 import torch
 import torch.nn.functional as F
 
-DEFAULT_MAX_CHUNK = 2048
-
 class ExLlamaGenerator:
 
     class Settings:
@@ -162,20 +160,15 @@ def disallow_tokens(self, tokens):
         self.disallowed_tokens = tokens
 
 
-    def gen_begin(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK):
+    def gen_begin(self, in_tokens):
 
         self.end_beam_search()
 
         self.sequence = in_tokens.clone()
         self.sequence_actual = in_tokens.clone()
         self.cache.current_seq_len = 0
 
         if in_tokens.shape[-1] > 1:
-            a = 0
-            while a < self.sequence.shape[-1] - 1:
-                b = min(a + max_chunk, self.sequence.shape[-1] - 1)
-                self.model.forward(self.sequence[:, a:b], self.cache, preprocess_only = True, lora = self.lora)
-                a = b
+            self.model.forward(self.sequence[:, :-1], self.cache, preprocess_only = True, lora = self.lora)
 
 
     def gen_begin_empty(self):
@@ -186,11 +179,11 @@ def gen_begin_empty(self):
         self.cache.current_seq_len = 0
 
 
-    def gen_begin_reuse(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK):
+    def gen_begin_reuse(self, in_tokens):
 
         self.end_beam_search()
         if self.sequence is None or self.cache.current_seq_len == 0:
-            self.gen_begin(in_tokens, max_chunk)
+            self.gen_begin(in_tokens)
             return 0
 
         # if in_tokens.shape[-1] < self.sequence.shape[-1]:
@@ -201,7 +194,7 @@ def gen_begin_reuse(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK):
             reuse += 1
 
         if reuse < 2:
-            self.gen_begin(in_tokens, max_chunk)
+            self.gen_begin(in_tokens)
             return 0
 
         # print (f"Reusing cache: {reuse} tokens")
@@ -214,10 +207,10 @@ def gen_begin_reuse(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK):
         return reuse
 
 
-    def gen_feed_tokens(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK):
+    def gen_feed_tokens(self, in_tokens):
 
         if self.sequence is None:
-            self.gen_begin(in_tokens, max_chunk)
+            self.gen_begin(in_tokens)
             return
 
         self.end_beam_search()
@@ -229,11 +222,8 @@ def gen_feed_tokens(self, in_tokens, max_chunk = DEFAULT_MAX_CHUNK):
         else:
             self.sequence = torch.cat((self.sequence, in_tokens), dim = 1)
 
-        a = start
-        while a < self.sequence.shape[-1] - 1:
-            b = min(a + max_chunk, self.sequence.shape[-1] - 1)
-            self.model.forward(self.sequence[:, a:b], self.cache, preprocess_only = True, lora = self.lora)
-            a = b
+        if start < self.sequence.shape[-1] - 1:
+            self.model.forward(self.sequence[:, start : -1], self.cache, preprocess_only = True, lora = self.lora)
 
         self.sequence_actual = self.sequence

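For context, a rough usage sketch of the generator after this change. The setup boilerplate below follows the repo's example scripts and is assumed, not part of this commit, and the paths are placeholders. The point is that callers no longer pass a max_chunk argument; long prompts are now split inside ExLlama.forward() according to config.max_input_len and config.max_attention_size.

# Hypothetical usage sketch; setup and paths follow the repo's example scripts
from model import ExLlama, ExLlamaCache, ExLlamaConfig
from tokenizer import ExLlamaTokenizer
from generator import ExLlamaGenerator

config = ExLlamaConfig("models/llama-13b-4bit/config.json")      # placeholder path
config.model_path = "models/llama-13b-4bit/model.safetensors"    # placeholder path
config.max_input_len = 2048           # per-chunk cap on input IDs (new setting)
config.max_attention_size = 2048**2   # cap on (past_len + chunk_len) * chunk_len (new setting)

model = ExLlama(config)
tokenizer = ExLlamaTokenizer("models/llama-13b-4bit/tokenizer.model")  # placeholder path
cache = ExLlamaCache(model)
generator = ExLlamaGenerator(model, tokenizer, cache)

ids = tokenizer.encode("some very long prompt ...")
generator.gen_begin(ids)              # no max_chunk argument; chunking now happens in ExLlama.forward()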
73 changes: 69 additions & 4 deletions model.py
@@ -68,9 +68,11 @@ def __init__(self, model_config_path):
 
         # Optional settings
 
-        self.max_seq_len = 2048                     # Reduce to save memory. Can also be increased, but the pretrained models produce degenerate output after 2048 tokens in any case. Should be possible to finetune for longer sequence lengths.
+        self.max_seq_len = 2048                     # Reduce to save memory. Can also be increased, ideally while also using compress_pos_emb and a compatible model/LoRA
+        self.max_input_len = 2048                   # Maximum length of input IDs in a single forward pass. Sequences longer than this will be processed in multiple steps
+        self.max_attention_size = 2048**2           # Sequences will be processed in chunks to keep the size of the attention weights matrix <= this
         self.compress_pos_emb = 1.0                 # Increase to compress positional embeddings applied to sequence
-        self.gpu_peer_fix = False                   # Apparently Torch can have problems transferring tensors directly from one GPU to another sometimes. Enable this to move tensors via system RAM instead, where needed
+        self.gpu_peer_fix = False                   # Apparently Torch can have problems transferring tensors directly from one GPU to another sometimes. Enable this to explicitly move tensors via system RAM instead, where needed

aljungberg (Contributor) commented on Jun 28, 2023:

You might be interested to know that this could be a driver issue. If you're using 2x RTX 4090, there has been a driver bug on Linux causing corrupt results. It seems that although NVIDIA doesn't want their latest-generation consumer cards to be able to do peer-to-peer transfers, they forgot to fully disable it in the driver. So the data transfer would appear to succeed, but you'd be reading uninitialised memory on the next card. Driver 525.105.17 and above should fix this.

turboderp (Author, Owner) commented on Jun 28, 2023:

Well, I never found out what it was, and I'm not sure it's this bug. Someone was having an issue (#10) on 530.41.03 which would cause tensor.to() to create empty tensors on the target device instead of copying any data across. It may be related to IOMMU settings somehow, but I'm not sure.

I can't get peer access working between my 4090 and 3090Ti, sadly, but Torch seems to just implicitly move tensors via system RAM as needed. I guess some combination of settings can cause it to assume peer access is supported when it isn't.

jmoney7823956789378 commented on Jun 28, 2023:

Just to confirm: the intended scenario for multi-GPU is pure GPU <-> GPU tensor transfers, but certain setups have issues with this method and fall back to GPU <-> RAM <-> GPU?
If so, how would you confirm which method is being used for inference in any given hardware environment?

turboderp (Author, Owner) commented on Jun 28, 2023:

I don't actually know if Torch exposes this somehow. But you could just try a simple script that copies a large tensor back and forth a few times and check whether tensor.to("cuda:1").to("cuda:0") is any faster than tensor.to("cpu").to("cuda:1").to("cpu").to("cuda:0").

For ExLlama it doesn't matter too much, since it's only the hidden state that moves from device to device, and only once, where the model is split. In a single-token forward pass that tensor is some kilobytes in size, so the bandwidth requirement is negligible.

Ideally you'd want to use two or more devices in parallel, and I already have most of the code written up for that, but it's much too slow without peer access, requiring synchronization before and after every matmul.

jmoney7823956789378 commented on Jun 28, 2023:

> But you could just try a simple script that copies a large tensor back and forth a few times and check whether tensor.to("cuda:1").to("cuda:0") is any faster than tensor.to("cpu").to("cuda:1").to("cpu").to("cuda:0").

Unfortunately, I'm only a Python reader, not a writer.
I feel bad asking, but I'm sure you've written this script once or twice before... would you be able to drop it in a discussion thread for testing?

aljungberg (Contributor) commented on Jun 28, 2023:

> Well, I never found out what it was, and I'm not sure it's this bug. Someone was having an issue (#10) on 530.41.03 which would cause tensor.to() to create empty tensors on the target device instead of copying any data across. It may be related to IOMMU settings somehow, but I'm not sure.

An empty tensor is an uninitialised tensor, so that sounds exactly like the problem. 530.41.03 is one of the affected drivers: "The current 530.xx driver - 530.41.03 does not include the [fix]."

And I see the person in #10 is indeed using dual RTX 4090 cards, so it's exactly the recipe for triggering this peer-to-peer bug. This was breaking all my fine-tuning on multi-GPU with qlora for a while.

> I can't get peer access working between my 4090 and 3090Ti, sadly, but Torch seems to just implicitly move tensors via system RAM as needed. I guess some combination of settings can cause it to assume peer access is supported when it isn't.

The driver is supposed to report whether peer access works or not, and in this case it reports that it does while in reality it doesn't. PyTorch's only fault is believing the driver, I think. It shouldn't work with your setup since it includes a 4090 and, it seems, NVIDIA is actively blocking this feature on 4090s. If you tried it with two 3090s, I think it would work.

turboderp (Author, Owner) commented on Jun 28, 2023:

@jmoney7823956789378 Stuck a little script in #113.
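For anyone following along, here is a rough sketch of the kind of check described above. It is not the script posted in #113, just an illustration of the approach, assuming two CUDA devices; torch.cuda.can_device_access_peer() only reports what the driver claims about peer access.

# Rough sketch, not the actual script from #113: compare direct GPU<->GPU copies
# against copies routed through system RAM, and print what the driver reports.
import time
import torch

def timed(fn, n = 20):
    # Time n round trips, synchronizing both devices before and after
    torch.cuda.synchronize("cuda:0")
    torch.cuda.synchronize("cuda:1")
    t = time.time()
    for _ in range(n): fn()
    torch.cuda.synchronize("cuda:0")
    torch.cuda.synchronize("cuda:1")
    return time.time() - t

x = torch.randn((8192, 8192), dtype = torch.float16, device = "cuda:0")

print("Driver reports peer access 0 -> 1:", torch.cuda.can_device_access_peer(0, 1))

direct = timed(lambda: x.to("cuda:1").to("cuda:0"))
via_ram = timed(lambda: x.to("cpu").to("cuda:1").to("cpu").to("cuda:0"))

print(f"direct GPU<->GPU: {direct:.3f} s   via system RAM: {via_ram:.3f} s")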

         self.auto_map = None                       # List of floats with memory allocation in GB, per CUDA device, overrides device_map
 
         # Tuning
@@ -779,7 +781,7 @@ def __init__(self, config):
             device_buffers = {}
             self.buffers.append(device_buffers)
 
-            temp_state = torch.zeros((config.max_seq_len, config.intermediate_size), dtype = torch.float16, device = dev)
+            temp_state = torch.zeros((config.max_input_len, config.intermediate_size), dtype = torch.float16, device = dev)
             temp_mlp = torch.zeros((config.fused_mlp_thd * 2, config.intermediate_size), dtype = torch.float16, device = dev)
             temp_zeros_float = torch.zeros((1, 65536), dtype = torch.float32, device = dev)
             temp_dq = torch.zeros((1, max_dq_buffer_size), dtype = torch.float16, device = dev)
@@ -800,7 +802,70 @@ def __init__(self, config):
         torch.cuda.empty_cache()
 
 
-    def forward(self, input_ids, cache, last_id_only = True, preprocess_only = False, lora = None, output_device = None, input_mask = None):
+    def forward(self,
+                input_ids,
+                cache,
+                last_id_only = True,
+                preprocess_only = False,
+                lora = None,
+                output_device = None,
+                input_mask = None):
+
+        q_len = input_ids.shape[-1]
+        remaining_q_len = q_len
+
+        # Split forward pass
+
+        result = None
+
+        chunk_begin = 0
+        while chunk_begin < q_len:
+
+            # Limit chunk_size to max_input_len
+
+            chunk_size = min(remaining_q_len, self.config.max_input_len)
+
+            # Limit chunk_size to keep size of attention operation <= max_attention_size
+
+            past_len = cache.current_seq_len
+            attn_size = (past_len + remaining_q_len) * remaining_q_len
+            max_a = self.config.max_attention_size
+            if attn_size > max_a:
+                cs = (math.sqrt(past_len ** 2 + 4 * max_a) - past_len) / 2
+                chunk_size = math.floor(cs)
+
+            # Process chunk
+
+            chunk_end = min(chunk_begin + chunk_size, q_len)
+
+            _last_id_only = last_id_only
+            _preprocess_only = preprocess_only or (chunk_end < q_len and last_id_only)
+
+            r = self._forward(input_ids[:, chunk_begin : chunk_end],
+                              cache,
+                              _last_id_only,
+                              _preprocess_only,
+                              lora,
+                              output_device,
+                              input_mask)
+
+            if not _preprocess_only:
+                result = r if result is None else torch.cat((result, r), dim = -1)
+
+            chunk_begin = chunk_end
+            remaining_q_len -= chunk_size
+
+        return result
+
+
+    def _forward(self,
+                 input_ids,
+                 cache,
+                 last_id_only = True,
+                 preprocess_only = False,
+                 lora = None,
+                 output_device = None,
+                 input_mask = None):
 
         assert input_mask is None or input_mask.shape == input_ids.shape

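A note on the attention cap in the new forward() above: when (past_len + remaining_q_len) * remaining_q_len would exceed max_attention_size, the chunk size is capped at the positive root of cs^2 + past_len * cs - max_attention_size = 0, which is the largest cs such that (past_len + cs) * cs <= max_attention_size. A small standalone illustration with made-up numbers:

# Standalone illustration of the chunk-size bound used in ExLlama.forward above
import math

max_attention_size = 2048 ** 2   # default value of the new config setting
past_len = 3000                  # example: 3000 tokens already in the cache

# Positive root of cs^2 + past_len * cs - max_attention_size = 0
cs = (math.sqrt(past_len ** 2 + 4 * max_attention_size) - past_len) / 2
chunk_size = math.floor(cs)

print(chunk_size)                                      # 1038
print((past_len + chunk_size) * chunk_size)            # 4191444 <= 4194304, fits
print((past_len + chunk_size + 1) * (chunk_size + 1))  # 4196521  > 4194304, too big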
