Share gpu (#75)
* share gpu initial

* list of model statuses for one device

* share gpu + multiple statuses

* gui-backend integration fixes

* add estimated required memory

* low memory alert

* check if models data is ready

* space fix

* jumping status fix

---------

Co-authored-by: oxyplay <max@oxyplay.com>
mitya52 and oxyplay authored Aug 18, 2023
1 parent dc56fa8 commit d09e260
Showing 8 changed files with 176 additions and 75 deletions.
8 changes: 8 additions & 0 deletions known_models_db/refact_known_models/huggingface.py
@@ -5,6 +5,7 @@
         "diff_scratchpad_class": "refact_scratchpads:ScratchpadHuggingface",
         "chat_scratchpad_class": None,
         "model_class_kwargs": {},
+        "required_memory_mb": 18000,
         "filter_caps": ["completion"],
     },
     "starcoder/15b/plus": {
@@ -13,6 +14,7 @@
         "diff_scratchpad_class": "refact_scratchpads:ScratchpadHuggingface",
         "chat_scratchpad_class": None,
         "model_class_kwargs": {},
+        "required_memory_mb": 18000,
         "filter_caps": ["completion"],
     },
     "starchat/15b/beta": {
@@ -21,6 +23,7 @@
         "diff_scratchpad_class": None,
         "chat_scratchpad_class": "refact_scratchpads:ScratchpadHuggingfaceStarChat",
         "model_class_kwargs": {},
+        "required_memory_mb": 18000,
         "filter_caps": ["starchat"],
     },
     "wizardcoder/15b": {
@@ -29,6 +32,7 @@
         "diff_scratchpad_class": "refact_scratchpads:ScratchpadHuggingface",
         "chat_scratchpad_class": None,
         "model_class_kwargs": {},
+        "required_memory_mb": 18000,
         "filter_caps": ["completion"],
     },
     "wizardlm/7b": {
@@ -39,6 +43,7 @@
         "model_class_kwargs": {
             "model_basename": "wizardlm-7b-v1.0-uncensored-GPTQ-4bit-128g.no-act.order",
         },
+        "required_memory_mb": 8000,
         "filter_caps": ["wizardlm"],
     },
     "wizardlm/13b": {
@@ -49,6 +54,7 @@
         "model_class_kwargs": {
             "model_basename": "wizardlm-13b-v1.1-GPTQ-4bit-128g.no-act.order",
         },
+        "required_memory_mb": 14000,
         "filter_caps": ["wizardlm"],
     },
     "llama2/7b": {
@@ -59,6 +65,7 @@
         "model_class_kwargs": {
             "model_basename": "gptq_model-4bit-128g",
         },
+        "required_memory_mb": 8000,
         "filter_caps": ["llama2"],
     },
     "llama2/13b": {
@@ -69,6 +76,7 @@
         "model_class_kwargs": {
             "model_basename": "gptq_model-4bit-128g",
         },
+        "required_memory_mb": 14000,
         "filter_caps": ["llama2"],
     },
     "wizardlm/30b/4bit": {
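
These entries only declare metadata; the check behind the GUI's new low-memory alert would look roughly like the sketch below. The helper name and call site are hypothetical, only the required_memory_mb field comes from this diff:

    # Hypothetical helper, not the repo's actual code: compare the sum of
    # declared model memory on one GPU against that GPU's total memory (MB).
    from typing import List

    def required_memory_exceeds(models: List[dict], gpu_mem_total_mb: int) -> bool:
        required = sum(m.get("required_memory_mb", 0) for m in models)
        return required > gpu_mem_total_mb

    # Two 15b-class models declared at 18000 MB each overflow a 24 GB card:
    assert required_memory_exceeds([{"required_memory_mb": 18000}] * 2, 24000)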
2 changes: 2 additions & 0 deletions known_models_db/refact_known_models/refact.py
@@ -6,6 +6,7 @@
         "chat_scratchpad_class": None,
         "model_class": "refact_models:CodifyModel",
         "T": 2048,
+        "required_memory_mb": 3500,
         "filter_caps": ["CONTRASTcode", "completion"],
     },

@@ -16,6 +17,7 @@
         "chat_scratchpad_class": None,
         "model_class": "refact_models:CodifyModel",
         "T": 2048,
+        "required_memory_mb": 8500,
         "filter_caps": ["CONTRASTcode", "completion", "finetune"],
     },
45 changes: 25 additions & 20 deletions self_hosting_machinery/watchdog/docker_watchdog.py
@@ -6,7 +6,7 @@
 import sys
 import time
 import uuid
-from typing import Dict, Optional
+from typing import Dict, Optional, List

 from self_hosting_machinery import env

@@ -188,7 +188,7 @@ def maybe_can_start(self):
             if can_start:
                 self._start()
         elif "always_on_low_priority" in policy:
-            can_start = low_priority_can_start(self.cfg.get("gpus", []))
+            can_start = low_priority_can_start(self)
             if can_start:
                 self._start()
         elif "at_night" in policy:
@@ -197,6 +197,15 @@
             if self.start_ts + self.cfg["restart_every"] < time.time():
                 self._start()

+    def __str__(self):
+        return f"TrackedJob:\n" \
+               f"  pid: {self.p.pid if self.p else None}\n" \
+               f"  cmd: '{self.cmdline_str}'\n" \
+               f"  start_ts: {self.start_ts}\n" \
+               f"  cfg: {self.cfg}\n" \
+               f"  shutdown: {self.please_shutdown}\n" \
+               f"  remove: {self.remove_this}\n" \
+               f"  status: {self.status_from_stderr}\n"


 tracked: Dict[str, TrackedJob] = {}
@@ -244,11 +253,13 @@ def preempt_low_priority(gpus):
     return can_start


-def low_priority_can_start(gpus):
+def low_priority_can_start(job: TrackedJob):
     can_start = True
-    for job in tracked.values():
-        if set(gpus) & set(job.cfg["gpus"]):
-            if job.p is not None:
+    for tracked_job in tracked.values():
+        if job.cfg.get("share_gpu", False) and tracked_job.cfg.get("share_gpu", False):
+            continue
+        if set(job.cfg.get("gpus", [])) & set(tracked_job.cfg.get("gpus", [])):
+            if tracked_job.p is not None:
                 can_start = False
     return can_start
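
Restated outside the watchdog (hypothetical standalone helper; the real function consults the global tracked dict and live processes): a low-priority job may start on an occupied GPU only when both it and the job already on that GPU opted in via share_gpu.

    def may_start(new_cfg: dict, running_cfgs: list) -> bool:
        # Sketch of the rule in low_priority_can_start above.
        for running in running_cfgs:
            if new_cfg.get("share_gpu", False) and running.get("share_gpu", False):
                continue  # both sides opted in, so no conflict
            if set(new_cfg.get("gpus", [])) & set(running.get("gpus", [])):
                return False  # GPU overlap without mutual share_gpu
        return True

    # may_start({"gpus": [0], "share_gpu": True}, [{"gpus": [0], "share_gpu": True}])   -> True
    # may_start({"gpus": [0], "share_gpu": True}, [{"gpus": [0], "share_gpu": False}])  -> False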

@@ -258,26 +269,20 @@ def low_priority_can_start(gpus):

 def inform_about_gpu_status():
     global _inform_about_gpu_status
-    MAX = 16
-    gpu_command = [""] * MAX
-    gpu_status = [""] * MAX
+    gpu_status: Dict[int, List[Dict]] = {}
     for job in tracked.values():
         if job.p is None:
             continue
-        for gpu in job.cfg["gpus"]:
-            if gpu >= 0 and gpu < len(gpu_status):
+        for gpu in map(int, job.cfg["gpus"]):
+            if gpu >= 0:
                 t = job.cmdline_str
                 if t.startswith("python -m"):
                     t = t[len("python -m"):]
-                gpu_command[gpu] = t.strip()
-                gpu_status[gpu] = job.status_from_stderr
-    j = {"gpus": [{}]*16}
-    for i in range(MAX):
-        j["gpus"][i] = {
-            "command": gpu_command[i],
-            "status": gpu_status[i],
-        }
-    s = json.dumps(j, indent=4) + "\n"
+                gpu_status.setdefault(gpu, []).append({
+                    "command": t.strip(),
+                    "status": job.status_from_stderr,
+                })
+    s = json.dumps({"gpus": gpu_status}, indent=4) + "\n"
     if s != _inform_about_gpu_status:
         with open(env.CONFIG_BUSY_GPUS + ".tmp", "w") as f:
             f.write(s)
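
Because the status map is now keyed by GPU index with a list per device, the JSON written to env.CONFIG_BUSY_GPUS can report several sharing models on one card. An illustrative payload (commands and statuses invented; note that json.dumps renders the integer keys as strings):

    {
        "gpus": {
            "0": [
                {"command": "self_hosting_machinery.inference.inference_worker ...", "status": "working"},
                {"command": "self_hosting_machinery.inference.inference_worker ...", "status": "loading"}
            ]
        }
    }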
3 changes: 2 additions & 1 deletion self_hosting_machinery/watchdog/watchdog.d/model.cfg
@@ -3,5 +3,6 @@
     "command_line": ["python", "-m", "self_hosting_machinery.inference.inference_worker"],
     "unfinished": true,
     "needs_compile": true,
-    "gpus": []
+    "gpus": [],
+    "share_gpu": false
 }
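
A rendered config for a job that opts into sharing would presumably differ only in the last two fields; illustrative values (the hunk hides the file's first two lines, which are unchanged):

    {
        ...
        "gpus": [0],
        "share_gpu": true
    }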
7 changes: 6 additions & 1 deletion self_hosting_machinery/webgui/static/style.css
@@ -233,7 +233,7 @@ h3 {
 }
 .gpus-item {
     display: flex;
-    justify-content: center;
+    justify-content: flex-start;
     text-align: center;
     border: 1px solid rgba(0,0,0,0.15);
     border-radius: 8px;
@@ -626,6 +626,11 @@ h3 {
 }
 .model-hosting-error {
     color: darkred;
+    text-align: right;
+}
+.model-memory-error {
+    color: darkred;
+    text-align: right;
 }
 .disabled-group {
     opacity: 0.4;
6 changes: 5 additions & 1 deletion self_hosting_machinery/webgui/static/tab-model-hosting.html
@@ -10,6 +10,7 @@ <h3>Hosted Models</h3>
                     <th>Model</th>
                     <th>Completion</th>
                     <th>Sharding</th>
+                    <th>Sharing</th>
                     <th></th>
                 </tr>
             </thead>
@@ -19,7 +20,10 @@ <h3>Hosted Models</h3>
             </table>
             <div class="model-hosting-controls">
                 <button data-bs-toggle="modal" data-bs-target="#add-model-modal" class="btn btn-primary model-hosting-add"><i class="bi bi-plus-lg"></i> Add Model</button>
-                <div class="model-hosting-error d-none"><i class="bi bi-exclamation-triangle-fill"></i> You have more models selected than GPUs available.</div>
+                <div>
+                    <div class="model-hosting-error"><i class="bi bi-exclamation-triangle-fill"></i> You have more models selected than GPUs available.</div>
+                    <div class="model-memory-error"><i class="bi bi-exclamation-triangle-fill"></i> Required memory exceeds the GPU's memory.</div>
+                </div>
             </div>
         </div>
         <div class="pane">
71 changes: 49 additions & 22 deletions self_hosting_machinery/webgui/static/tab-model-hosting.js
@@ -34,37 +34,39 @@ function render_gpus(gpus) {
         gpu_mem.classList.add('gpus-mem');
         const gpu_temp = document.createElement("div");
         gpu_temp.classList.add('gpus-temp');
-        const gpu_command = document.createElement("div");
-        gpu_command.classList.add('gpus-command');
-        const gpu_status = document.createElement("div");
-        gpu_status.classList.add('gpus-status');

         const used_gb = format_memory(element.mem_used_mb);
         const total_gb = format_memory(element.mem_total_mb);
         const used_mem = Math.round(element.mem_used_mb / (element.mem_total_mb / 100));
         gpu_name.innerHTML = element.name;
         gpu_mem.innerHTML = `<b>Mem</b><div class="gpus-mem-wrap"><div class="gpus-mem-bar"><span style="width: ${used_mem}%"></span></div>${used_gb}/${total_gb} GB</div>`;
         gpu_temp.innerHTML = `<b>Temp</b>` + element.temp_celsius + '°C';
-        gpu_command.innerHTML = `<span class="gpus-current-status">${element.status}</span>`;
-        gpu_status.innerHTML += `<div><b>Command</b>${element.command}</div>`;
-        gpu_status.innerHTML += `<div><b>Status</b>${element.status}</div>`;
-        gpu_command.appendChild(gpu_status);
-        gpu_command.addEventListener('mouseover',function(e) {
-            gpus_popup = true;
-            this.querySelector('.gpus-status').classList.add('gpus-status-visible');
-        });
-        gpu_command.addEventListener('mouseout',function(e) {
-            gpus_popup = false;
-            this.querySelector('.gpus-status').classList.remove('gpus-status-visible');
-        });
-        if(!element.status || element.status === '') {
-            gpu_command.classList.add('gpus-status-invisible');
-        }
         row.appendChild(gpu_image);
         gpu_wrapper.appendChild(gpu_name);
         gpu_wrapper.appendChild(gpu_mem);
         gpu_wrapper.appendChild(gpu_temp);
-        gpu_wrapper.appendChild(gpu_command);
+        element.statuses.forEach(status => {
+            const gpu_command = document.createElement("div");
+            gpu_command.classList.add('gpus-command');
+            const gpu_status = document.createElement("div");
+            gpu_status.classList.add('gpus-status');
+            gpu_command.innerHTML = `<span class="gpus-current-status">${status.status}</span>`;
+            gpu_status.innerHTML += `<div><b>Command</b>${status.command}</div>`;
+            gpu_status.innerHTML += `<div><b>Status</b>${status.status}</div>`;
+            gpu_command.appendChild(gpu_status);
+            gpu_command.addEventListener('mouseover',function(e) {
+                gpus_popup = true;
+                this.querySelector('.gpus-status').classList.add('gpus-status-visible');
+            });
+            gpu_command.addEventListener('mouseout',function(e) {
+                gpus_popup = false;
+                this.querySelector('.gpus-status').classList.remove('gpus-status-visible');
+            });
+            if(!status.status || status.status === '') {
+                gpu_command.classList.add('gpus-status-invisible');
+            }
+            gpu_wrapper.appendChild(gpu_command);
+        });

         row.appendChild(gpu_wrapper);
         gpus_list.appendChild(row);
     });
@@ -84,11 +86,17 @@ function get_models()
         enable_chat_gpt_switch.checked = models_data['openai_api_enable'];
         enable_chat_gpt_switch.addEventListener('change', save_model_assigned);
         const more_gpus_notification = document.querySelector('.model-hosting-error');
-        if(models_data.more_models_than_gpus) {
+        if(models_data && models_data.length > 0 && models_data.more_models_than_gpus) {
             more_gpus_notification.classList.remove('d-none');
         } else {
             more_gpus_notification.classList.add('d-none');
         }
+        const required_memory_exceed_available = document.querySelector('.model-memory-error');
+        if(models_data && models_data.length > 0 && models_data.required_memory_exceed_available) {
+            required_memory_exceed_available.classList.remove('d-none');
+        } else {
+            required_memory_exceed_available.classList.add('d-none');
+        }
     });
 }

@@ -187,6 +195,24 @@ function render_models_assigned(models) {
         // // models_gpus_change = true;
         // });
         gpus.appendChild(gpus_input);
+        const gpus_share = document.createElement("td");
+        const gpus_checkbox = document.createElement("input");
+        gpus_checkbox.setAttribute('type','checkbox');
+        gpus_checkbox.setAttribute('value',index);
+        gpus_checkbox.setAttribute('name',`share-${index}`);
+        gpus_checkbox.classList.add('form-check-input');
+        if(models_data.model_assign[index].share_gpu) {
+            gpus_checkbox.checked = true;
+        }
+        gpus_checkbox.addEventListener('change', function() {
+            if(this.checked) {
+                models_data.model_assign[index].share_gpu = true;
+            } else {
+                models_data.model_assign[index].share_gpu = false;
+            }
+            save_model_assigned();
+        });
+        gpus_share.appendChild(gpus_checkbox);
         del_button.innerHTML = `<i class="bi bi-trash3-fill"></i>`;
         del_button.dataset.model = index;
         del_button.addEventListener('click', function() {
@@ -199,6 +225,7 @@ function render_models_assigned(models) {
         row.appendChild(model_name);
         row.appendChild(completion);
         row.appendChild(select_gpus);
+        row.appendChild(gpus_share);
         row.appendChild(del);
         models_table.appendChild(row);
     }