From be74ae4ea04021d91145ac1e62d778a503783c3e Mon Sep 17 00:00:00 2001 From: henk717 Date: Fri, 19 Jul 2024 14:21:08 +0200 Subject: [PATCH 1/9] Henk's version of the fsize algo This is the current version of the fsize algo based on Pyro's algorithm with added padding. --- koboldcpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index 4eedb233a2d5f..34b9a00a90c15 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -616,7 +616,7 @@ def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how m headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128) ratio = mem/(fsize*csmul*1.5) if headcount > 0: - ratio = max(ratio,mem/(fsize*1.34 + (layers*headcount*headkvlen*cs*4.25))) + ratio = max(ratio,mem/(fsize*1.025 + (layers*headcount*headkvlen*cs*4) + (layers*4*headkvlen*cs*4) + (1.5*1024*1024*1024))) layerlimit = int(ratio*layers) else: layerlimit = 200 # assume full offload From ea86b082391680736717ffb3fd0c8b94ec433675 Mon Sep 17 00:00:00 2001 From: henk717 Date: Fri, 19 Jul 2024 16:53:15 +0200 Subject: [PATCH 2/9] Update koboldcpp.py Add debugs and bump padding --- koboldcpp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index 34b9a00a90c15..46d9b2130228c 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -615,8 +615,10 @@ def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how m headcount = ggufmeta[1] headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128) ratio = mem/(fsize*csmul*1.5) + print(ratio) if headcount > 0: - ratio = max(ratio,mem/(fsize*1.025 + (layers*headcount*headkvlen*cs*4) + (layers*4*headkvlen*cs*4) + (1.5*1024*1024*1024))) + ratio = max(ratio,mem/(fsize*1.1 + (layers*headcount*headkvlen*cs*4) + (layers*4*headkvlen*cs*4) + (1.5*1024*1024*1024))) + print(ratio) layerlimit = int(ratio*layers) else: layerlimit = 200 # assume full offload From 2bf4c09c966497bdb6948ce49773f765c33b4023 Mon Sep 17 00:00:00 2001 From: henk717 
Date: Fri, 19 Jul 2024 21:07:37 +0200 Subject: [PATCH 3/9] Pyro version Pyro didn't agree with my version, so here is a test with his version --- koboldcpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index 46d9b2130228c..0410a2f7cf8ab 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -617,7 +617,7 @@ def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how m ratio = mem/(fsize*csmul*1.5) print(ratio) if headcount > 0: - ratio = max(ratio,mem/(fsize*1.1 + (layers*headcount*headkvlen*cs*4) + (layers*4*headkvlen*cs*4) + (1.5*1024*1024*1024))) + ratio = (mem - (1.0*1024*1024*1024) - (layers*4*headkvlen*cs*4*1.25))/(fsize*1.1 + (layers*headcount*headkvlen*cs*4)) print(ratio) layerlimit = int(ratio*layers) else: From 2beb2dc1b8f14f19e9307a26b00f3c8f2f1b438c Mon Sep 17 00:00:00 2001 From: henk717 Date: Fri, 19 Jul 2024 22:46:44 +0200 Subject: [PATCH 4/9] Polish new auto layers This one cleans up some debug prints, restores the max behavior in case the old alg suits someone better and changes the 200 layers to be the actual max for all backends so users have a better feel for the models. 
--- koboldcpp.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 0410a2f7cf8ab..a1777ea09e65d 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -615,13 +615,15 @@ def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how m headcount = ggufmeta[1] headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128) ratio = mem/(fsize*csmul*1.5) - print(ratio) if headcount > 0: - ratio = (mem - (1.0*1024*1024*1024) - (layers*4*headkvlen*cs*4*1.25))/(fsize*1.1 + (layers*headcount*headkvlen*cs*4)) - print(ratio) + ratio = max(ratio, mem - (1.0*1024*1024*1024) - (layers*4*headkvlen*cs*4*1.25))/(fsize*1.1 + (layers*headcount*headkvlen*cs*4)) layerlimit = int(ratio*layers) else: - layerlimit = 200 # assume full offload + ggufmeta = read_gguf_metadata(filepath) + if not ggufmeta or ggufmeta[0]==0: #fail to read or no layers + layerlimit = 200 # assume full offload + else: + layerlimit = ggufmeta[0] + 3 return layerlimit except Exception as ex: return 0 From 29b83ea0f92fa64ad802563a6aa0924ef44fa9fa Mon Sep 17 00:00:00 2001 From: henk717 Date: Fri, 19 Jul 2024 22:54:33 +0200 Subject: [PATCH 5/9] Remove 10% margin The new version has been much more accurate, for low vram systems I only notice 1 layer difference. Getting rid of it so users can test if its still in safe margins like I expect. On a 6GB system it results in 18 layers instead of 17 being chosen for Tiefighter. 
--- koboldcpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index a1777ea09e65d..f43b4eb21fdec 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -616,7 +616,7 @@ def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how m headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128) ratio = mem/(fsize*csmul*1.5) if headcount > 0: - ratio = max(ratio, mem - (1.0*1024*1024*1024) - (layers*4*headkvlen*cs*4*1.25))/(fsize*1.1 + (layers*headcount*headkvlen*cs*4)) + ratio = max(ratio, mem - (1.0*1024*1024*1024) - (layers*4*headkvlen*cs*4*1.25))/(fsize + (layers*headcount*headkvlen*cs*4)) layerlimit = int(ratio*layers) else: ggufmeta = read_gguf_metadata(filepath) From cc6bb2337e751b0184bda05c905bca7133fca273 Mon Sep 17 00:00:00 2001 From: henk717 Date: Sat, 20 Jul 2024 12:12:34 +0200 Subject: [PATCH 6/9] Restore 500MB buffer to play it safe I'm not feeling confident most people keep their vram usage under 1GB with background tasks. For now since we are aiming to have it work on as many systems as possible I restore the 500MB extra space since the fsize inflation is gone. 
--- koboldcpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index f43b4eb21fdec..fd18ad5737ac2 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -616,7 +616,7 @@ def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how m headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128) ratio = mem/(fsize*csmul*1.5) if headcount > 0: - ratio = max(ratio, mem - (1.0*1024*1024*1024) - (layers*4*headkvlen*cs*4*1.25))/(fsize + (layers*headcount*headkvlen*cs*4)) + ratio = max(ratio, mem - (1.5*1024*1024*1024) - (layers*4*headkvlen*cs*4*1.25))/(fsize + (layers*headcount*headkvlen*cs*4)) layerlimit = int(ratio*layers) else: ggufmeta = read_gguf_metadata(filepath) From 6499c9d21a586081d4041eca24084ea4420b2570 Mon Sep 17 00:00:00 2001 From: henk717 Date: Sat, 20 Jul 2024 13:38:00 +0200 Subject: [PATCH 7/9] Cap layers at maximum When using the auto predict we don't want to go over the maximum amount of layers. Users should have a realistic feel for how large the model is. For example when I was using the new auto guesser to communicate if a larger model would fit on someone's system at a higher context, it originally made me think that the model had 60 layers. In reality it had less. This commit will take the layers of the model, and add 3 extra since that is the highest amount of additional layers a backend adds for the context handling (Mostly it's 1). 
--- koboldcpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index fd18ad5737ac2..a4e1a0fa48d73 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -617,7 +617,7 @@ def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how m ratio = mem/(fsize*csmul*1.5) if headcount > 0: ratio = max(ratio, mem - (1.5*1024*1024*1024) - (layers*4*headkvlen*cs*4*1.25))/(fsize + (layers*headcount*headkvlen*cs*4)) - layerlimit = int(ratio*layers) + layerlimit = min(int(ratio*layers), (layers + 3)) else: ggufmeta = read_gguf_metadata(filepath) if not ggufmeta or ggufmeta[0]==0: #fail to read or no layers layerlimit = 200 # assume full offload else: layerlimit = ggufmeta[0] + 3 return layerlimit except Exception as ex: return 0 From 22b39752b7d0337ffd749ddf68c40be3420807f8 Mon Sep 17 00:00:00 2001 From: henk717 Date: Sat, 20 Jul 2024 15:58:24 +0200 Subject: [PATCH 8/9] Remove old max layer code Turns out at extreme contexts on new models such as Nemo the old code is incorrectly assuming we can offload everything. It's also redundant to check for max layers the old way since I capped our new guesses. Old code is now removed to simplify it, and it changed the nemo guess from 43 layers to 15 layers. Still looking into the 15 part, still seems too high but can be the old algo taking over. 
--- koboldcpp.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index a4e1a0fa48d73..24e17a93a6b10 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -605,25 +605,18 @@ def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how m csmul = 1.2 elif cs and cs > 2048: csmul = 1.1 - if mem < fsize*1.6*csmul: - ggufmeta = read_gguf_metadata(filepath) - if not ggufmeta or ggufmeta[0]==0: #fail to read or no layers - sizeperlayer = fsize*csmul*0.052 - layerlimit = int(min(200,mem/sizeperlayer)) - else: - layers = ggufmeta[0] - headcount = ggufmeta[1] - headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128) - ratio = mem/(fsize*csmul*1.5) - if headcount > 0: - ratio = max(ratio, mem - (1.5*1024*1024*1024) - (layers*4*headkvlen*cs*4*1.25))/(fsize + (layers*headcount*headkvlen*cs*4)) - layerlimit = min(int(ratio*layers), (layers + 3)) + ggufmeta = read_gguf_metadata(filepath) + if not ggufmeta or ggufmeta[0]==0: #fail to read or no layers + sizeperlayer = fsize*csmul*0.052 + layerlimit = int(min(200,mem/sizeperlayer)) else: - ggufmeta = read_gguf_metadata(filepath) - if not ggufmeta or ggufmeta[0]==0: #fail to read or no layers - layerlimit = 200 # assume full offload - else: - layerlimit = ggufmeta[0] + 3 + layers = ggufmeta[0] + headcount = ggufmeta[1] + headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128) + ratio = mem/(fsize*csmul*1.5) + if headcount > 0: + ratio = max(ratio, mem - (1.5*1024*1024*1024) - (layers*4*headkvlen*cs*4*1.25))/(fsize + (layers*headcount*headkvlen*cs*4)) + layerlimit = min(int(ratio*layers), (layers + 3)) return layerlimit except Exception as ex: return 0 From d5dd00c2cbe19f6f9e4baaa2b6d40f2437fb63bc Mon Sep 17 00:00:00 2001 From: henk717 Date: Sun, 21 Jul 2024 16:51:43 +0200 Subject: [PATCH 9/9] Restructure algorithm into multiple parts As requested the different calculations in the algorithm now have their own sections and names so its easier to 
understand what parts are being used. This also fixes the typo that was caused as a result of it being harder to read, the typo made no difference during execution and the algorithm is confirmed to still work the same. --- koboldcpp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index 24e17a93a6b10..067768bd646ac 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -614,8 +614,11 @@ def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how m headcount = ggufmeta[1] headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128) ratio = mem/(fsize*csmul*1.5) + computemem = layers*4*headkvlen*cs*4*1.25 # For now the first 4 is the hardcoded result for a blasbatchsize of 512. Ideally we automatically calculate blasbatchsize / 4 but I couldn't easily grab the value yet - Henk + contextmem = layers*headcount*headkvlen*cs*4 + reservedmem = 1.5*1024*1024*1024 # Users often don't have their GPU's VRAM worth of memory, we assume 500MB to avoid driver swapping + 500MB for the OS + 500MB for background apps / browser - Henk if headcount > 0: - ratio = max(ratio, mem - (1.5*1024*1024*1024) - (layers*4*headkvlen*cs*4*1.25))/(fsize + (layers*headcount*headkvlen*cs*4)) + ratio = max(ratio, (mem - reservedmem - computemem) / (fsize + contextmem)) layerlimit = min(int(ratio*layers), (layers + 3)) return layerlimit except Exception as ex: