From d09e2605f007034257bec92409205dd2003e32a6 Mon Sep 17 00:00:00 2001
From: Dimitry Ageev
Date: Fri, 18 Aug 2023 19:01:09 +0300
Subject: [PATCH] Share gpu (#75)

* share gpu initial
* list of model statuses for one device
* share gpu + multiple statuses
* gui-backend integration fixes
* add estimated required memory
* low memory alert
* check if models data is ready
* space fix
* jumping status fix

---------

Co-authored-by: oxyplay
---
 .../refact_known_models/huggingface.py        |   8 ++
 known_models_db/refact_known_models/refact.py |   2 +
 .../watchdog/docker_watchdog.py               |  45 ++++----
 .../watchdog/watchdog.d/model.cfg             |   3 +-
 .../webgui/static/style.css                   |   7 +-
 .../webgui/static/tab-model-hosting.html      |   6 +-
 .../webgui/static/tab-model-hosting.js        |  71 ++++++++----
 .../webgui/tab_models_host.py                 | 109 +++++++++++++-----
 8 files changed, 176 insertions(+), 75 deletions(-)

diff --git a/known_models_db/refact_known_models/huggingface.py b/known_models_db/refact_known_models/huggingface.py
index 750f8c3e..e3d7f953 100644
--- a/known_models_db/refact_known_models/huggingface.py
+++ b/known_models_db/refact_known_models/huggingface.py
@@ -5,6 +5,7 @@
         "diff_scratchpad_class": "refact_scratchpads:ScratchpadHuggingface",
         "chat_scratchpad_class": None,
         "model_class_kwargs": {},
+        "required_memory_mb": 18000,
         "filter_caps": ["completion"],
     },
     "starcoder/15b/plus": {
@@ -13,6 +14,7 @@
         "diff_scratchpad_class": "refact_scratchpads:ScratchpadHuggingface",
         "chat_scratchpad_class": None,
         "model_class_kwargs": {},
+        "required_memory_mb": 18000,
         "filter_caps": ["completion"],
     },
     "starchat/15b/beta": {
@@ -21,6 +23,7 @@
         "diff_scratchpad_class": None,
         "chat_scratchpad_class": "refact_scratchpads:ScratchpadHuggingfaceStarChat",
         "model_class_kwargs": {},
+        "required_memory_mb": 18000,
         "filter_caps": ["starchat"],
     },
     "wizardcoder/15b": {
@@ -29,6 +32,7 @@
         "diff_scratchpad_class": "refact_scratchpads:ScratchpadHuggingface",
         "chat_scratchpad_class": None,
         "model_class_kwargs": {},
+        "required_memory_mb": 18000,
         "filter_caps": ["completion"],
     },
     "wizardlm/7b": {
@@ -39,6 +43,7 @@
         "model_class_kwargs": {
             "model_basename": "wizardlm-7b-v1.0-uncensored-GPTQ-4bit-128g.no-act.order",
         },
+        "required_memory_mb": 8000,
         "filter_caps": ["wizardlm"],
     },
     "wizardlm/13b": {
@@ -49,6 +54,7 @@
         "model_class_kwargs": {
             "model_basename": "wizardlm-13b-v1.1-GPTQ-4bit-128g.no-act.order",
         },
+        "required_memory_mb": 14000,
         "filter_caps": ["wizardlm"],
     },
     "llama2/7b": {
@@ -59,6 +65,7 @@
         "model_class_kwargs": {
             "model_basename": "gptq_model-4bit-128g",
         },
+        "required_memory_mb": 8000,
         "filter_caps": ["llama2"],
     },
     "llama2/13b": {
@@ -69,6 +76,7 @@
         "model_class_kwargs": {
             "model_basename": "gptq_model-4bit-128g",
         },
+        "required_memory_mb": 14000,
         "filter_caps": ["llama2"],
     },
     "wizardlm/30b/4bit": {
diff --git a/known_models_db/refact_known_models/refact.py b/known_models_db/refact_known_models/refact.py
index dfa6bb78..32cbc7e3 100644
--- a/known_models_db/refact_known_models/refact.py
+++ b/known_models_db/refact_known_models/refact.py
@@ -6,6 +6,7 @@
         "chat_scratchpad_class": None,
         "model_class": "refact_models:CodifyModel",
         "T": 2048,
+        "required_memory_mb": 3500,
         "filter_caps": ["CONTRASTcode", "completion"],
     },

@@ -16,6 +17,7 @@
         "chat_scratchpad_class": None,
         "model_class": "refact_models:CodifyModel",
         "T": 2048,
+        "required_memory_mb": 8500,
         "filter_caps": ["CONTRASTcode", "completion", "finetune"],
     },

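The two hunks above are pure data: every known model gains a required_memory_mb estimate. A minimal sketch, assuming invented GPU sizes, of how such estimates add up for models co-hosted on one card (the actual check is added to tab_models_host.py further down in this patch):

    from typing import Dict, Iterable

    # Values copied from the hunks above; only these three models are included
    # to keep the example small.
    REQUIRED_MEMORY_MB: Dict[str, int] = {
        "wizardlm/7b": 8000,
        "llama2/7b": 8000,
        "CONTRASTcode/3b/multi": 3500,
    }

    def fits_on_one_gpu(model_names: Iterable[str], gpu_mem_total_mb: int) -> bool:
        # Sum of the per-model estimates must not exceed the card's memory.
        needed = sum(REQUIRED_MEMORY_MB.get(name, 0) for name in model_names)
        return needed <= gpu_mem_total_mb

    print(fits_on_one_gpu(["wizardlm/7b", "llama2/7b"], 24000))  # True
    print(fits_on_one_gpu(["wizardlm/7b", "llama2/7b"], 12000))  # False
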
diff --git a/self_hosting_machinery/watchdog/docker_watchdog.py b/self_hosting_machinery/watchdog/docker_watchdog.py
index acd353e8..f007240b 100644
--- a/self_hosting_machinery/watchdog/docker_watchdog.py
+++ b/self_hosting_machinery/watchdog/docker_watchdog.py
@@ -6,7 +6,7 @@
 import sys
 import time
 import uuid
-from typing import Dict, Optional
+from typing import Dict, Optional, List

 from self_hosting_machinery import env

@@ -188,7 +188,7 @@ def maybe_can_start(self):
             if can_start:
                 self._start()
         elif "always_on_low_priority" in policy:
-            can_start = low_priority_can_start(self.cfg.get("gpus", []))
+            can_start = low_priority_can_start(self)
            if can_start:
                 self._start()
         elif "at_night" in policy:
@@ -197,6 +197,15 @@
         if self.start_ts + self.cfg["restart_every"] < time.time():
             self._start()

+    def __str__(self):
+        return f"TrackedJob:\n" \
+               f" pid: {self.p.pid if self.p else None}\n" \
+               f" cmd: '{self.cmdline_str}'\n" \
+               f" start_ts: {self.start_ts}\n" \
+               f" cfg: {self.cfg}\n" \
+               f" shutdown: {self.please_shutdown}\n" \
+               f" remove: {self.remove_this}\n" \
+               f" status: {self.status_from_stderr}\n"

 tracked: Dict[str, TrackedJob] = {}

@@ -244,11 +253,13 @@ def preempt_low_priority(gpus):
     return can_start


-def low_priority_can_start(gpus):
+def low_priority_can_start(job: TrackedJob):
     can_start = True
-    for job in tracked.values():
-        if set(gpus) & set(job.cfg["gpus"]):
-            if job.p is not None:
+    for tracked_job in tracked.values():
+        if job.cfg.get("share_gpu", False) and tracked_job.cfg.get("share_gpu", False):
+            continue
+        if set(job.cfg.get("gpus", [])) & set(tracked_job.cfg.get("gpus", [])):
+            if tracked_job.p is not None:
                 can_start = False
     return can_start

@@ -258,26 +269,20 @@

 def inform_about_gpu_status():
     global _inform_about_gpu_status
-    MAX = 16
-    gpu_command = [""] * MAX
-    gpu_status = [""] * MAX
+    gpu_status: Dict[int, List[Dict]] = {}
     for job in tracked.values():
         if job.p is None:
             continue
-        for gpu in job.cfg["gpus"]:
-            if gpu >= 0 and gpu < len(gpu_status):
+        for gpu in map(int, job.cfg["gpus"]):
+            if gpu >= 0:
                 t = job.cmdline_str
                 if t.startswith("python -m"):
                     t = t[len("python -m"):]
-                gpu_command[gpu] = t.strip()
-                gpu_status[gpu] = job.status_from_stderr
-    j = {"gpus": [{}]*16}
-    for i in range(MAX):
-        j["gpus"][i] = {
-            "command": gpu_command[i],
-            "status": gpu_status[i],
-        }
-    s = json.dumps(j, indent=4) + "\n"
+                gpu_status.setdefault(gpu, []).append({
+                    "command": t.strip(),
+                    "status": job.status_from_stderr,
+                })
+    s = json.dumps({"gpus": gpu_status}, indent=4) + "\n"
     if s != _inform_about_gpu_status:
         with open(env.CONFIG_BUSY_GPUS + ".tmp", "w") as f:
             f.write(s)
diff --git a/self_hosting_machinery/watchdog/watchdog.d/model.cfg b/self_hosting_machinery/watchdog/watchdog.d/model.cfg
index 0661c819..cb80f274 100644
--- a/self_hosting_machinery/watchdog/watchdog.d/model.cfg
+++ b/self_hosting_machinery/watchdog/watchdog.d/model.cfg
@@ -3,5 +3,6 @@
     "command_line": ["python", "-m", "self_hosting_machinery.inference.inference_worker"],
     "unfinished": true,
     "needs_compile": true,
-    "gpus": []
+    "gpus": [],
+    "share_gpu": false
 }
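The scheduling change above is the core of GPU sharing: low_priority_can_start() now receives the whole job and skips the overlap check when both jobs are flagged share_gpu. A standalone sketch of that rule, with plain dicts standing in for TrackedJob (job data here is invented for the example):

    from typing import Dict, List

    def low_priority_can_start(job: Dict, running: List[Dict]) -> bool:
        can_start = True
        for other in running:
            if job["cfg"].get("share_gpu", False) and other["cfg"].get("share_gpu", False):
                continue  # both jobs opted into sharing, overlapping GPUs are fine
            if set(job["cfg"].get("gpus", [])) & set(other["cfg"].get("gpus", [])):
                can_start = False  # overlap with a non-sharing job blocks the start
        return can_start

    running = [{"cfg": {"gpus": [0], "share_gpu": True}}]
    print(low_priority_can_start({"cfg": {"gpus": [0], "share_gpu": True}}, running))   # True
    print(low_priority_can_start({"cfg": {"gpus": [0], "share_gpu": False}}, running))  # False

A job that did not opt into sharing still blocks, and is itself blocked on, any overlapping GPU.
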
diff --git a/self_hosting_machinery/webgui/static/style.css b/self_hosting_machinery/webgui/static/style.css
index fa9ec23a..bbe5812a 100644
--- a/self_hosting_machinery/webgui/static/style.css
+++ b/self_hosting_machinery/webgui/static/style.css
@@ -233,7 +233,7 @@ h3 {
 }
 .gpus-item {
     display: flex;
-    justify-content: center;
+    justify-content: flex-start;
     text-align: center;
     border: 1px solid rgba(0,0,0,0.15);
     border-radius: 8px;
@@ -626,6 +626,11 @@ h3 {
 }
 .model-hosting-error {
     color: darkred;
+    text-align: right;
+}
+.model-memory-error {
+    color: darkred;
+    text-align: right;
 }
 .disabled-group {
     opacity: 0.4;
diff --git a/self_hosting_machinery/webgui/static/tab-model-hosting.html b/self_hosting_machinery/webgui/static/tab-model-hosting.html
index af42553e..fd8e925e 100644
--- a/self_hosting_machinery/webgui/static/tab-model-hosting.html
+++ b/self_hosting_machinery/webgui/static/tab-model-hosting.html
@@ -10,6 +10,7 @@

Hosted Models

Model Completion Sharding + Sharing @@ -19,7 +20,10 @@

Hosted Models

-
You have more models selected than GPUs available.
+
+
You have more models selected than GPUs available.
+
Required memory exceeds the GPU's memory.
+
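The new Sharing column and the .model-memory-error alert above are driven by the inference config that the backend writes (tab_models_host.py below). An illustrative shape of that payload, with invented model choices; the key names come from this patch:

    import json

    inference_config = {
        "model_assign": {
            "CONTRASTcode/3b/multi": {"gpus_shard": 1, "share_gpu": True},
            "wizardlm/7b": {"gpus_shard": 1, "share_gpu": True},
        },
        "completion": "CONTRASTcode/3b/multi",
        # Flags computed by models_to_watchdog_configs(); they toggle the
        # .model-hosting-error and .model-memory-error alerts respectively.
        "more_models_than_gpus": False,
        "required_memory_exceed_available": False,
    }
    print(json.dumps(inference_config, indent=4))
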
diff --git a/self_hosting_machinery/webgui/static/tab-model-hosting.js b/self_hosting_machinery/webgui/static/tab-model-hosting.js
index f9d66ca7..e7f0f654 100644
--- a/self_hosting_machinery/webgui/static/tab-model-hosting.js
+++ b/self_hosting_machinery/webgui/static/tab-model-hosting.js
@@ -34,37 +34,39 @@ function render_gpus(gpus) {
         gpu_mem.classList.add('gpus-mem');
         const gpu_temp = document.createElement("div");
         gpu_temp.classList.add('gpus-temp');
-        const gpu_command = document.createElement("div");
-        gpu_command.classList.add('gpus-command');
-        const gpu_status = document.createElement("div");
-        gpu_status.classList.add('gpus-status');
-
         const used_gb = format_memory(element.mem_used_mb);
         const total_gb = format_memory(element.mem_total_mb);
         const used_mem = Math.round(element.mem_used_mb / (element.mem_total_mb / 100));
         gpu_name.innerHTML = element.name;
         gpu_mem.innerHTML = `Mem
${used_gb}/${total_gb} GB
`;
         gpu_temp.innerHTML = `Temp` + element.temp_celsius + '°C';
-        gpu_command.innerHTML = `${element.status}`;
-        gpu_status.innerHTML += `
Command${element.command}
`;
-        gpu_status.innerHTML += `
Status${element.status}
`;
-        gpu_command.appendChild(gpu_status);
-        gpu_command.addEventListener('mouseover',function(e) {
-            gpus_popup = true;
-            this.querySelector('.gpus-status').classList.add('gpus-status-visible');
-        });
-        gpu_command.addEventListener('mouseout',function(e) {
-            gpus_popup = false;
-            this.querySelector('.gpus-status').classList.remove('gpus-status-visible');
-        });
-        if(!element.status || element.status === '') {
-            gpu_command.classList.add('gpus-status-invisible');
-        }
         row.appendChild(gpu_image);
         gpu_wrapper.appendChild(gpu_name);
         gpu_wrapper.appendChild(gpu_mem);
         gpu_wrapper.appendChild(gpu_temp);
-        gpu_wrapper.appendChild(gpu_command);
+        element.statuses.forEach(status => {
+            const gpu_command = document.createElement("div");
+            gpu_command.classList.add('gpus-command');
+            const gpu_status = document.createElement("div");
+            gpu_status.classList.add('gpus-status');
+            gpu_command.innerHTML = `${status.status}`;
+            gpu_status.innerHTML += `
Command${status.command}
`;
+            gpu_status.innerHTML += `
Status${status.status}
`;
+            gpu_command.appendChild(gpu_status);
+            gpu_command.addEventListener('mouseover',function(e) {
+                gpus_popup = true;
+                this.querySelector('.gpus-status').classList.add('gpus-status-visible');
+            });
+            gpu_command.addEventListener('mouseout',function(e) {
+                gpus_popup = false;
+                this.querySelector('.gpus-status').classList.remove('gpus-status-visible');
+            });
+            if(!status.status || status.status === '') {
+                gpu_command.classList.add('gpus-status-invisible');
+            }
+            gpu_wrapper.appendChild(gpu_command);
+        });
+
         row.appendChild(gpu_wrapper);
         gpus_list.appendChild(row);
     });
@@ -84,11 +86,17 @@ function get_models()
         enable_chat_gpt_switch.checked = models_data['openai_api_enable'];
         enable_chat_gpt_switch.addEventListener('change', save_model_assigned);
         const more_gpus_notification = document.querySelector('.model-hosting-error');
-        if(models_data.more_models_than_gpus) {
+        if(models_data && models_data.length > 0 && models_data.more_models_than_gpus) {
             more_gpus_notification.classList.remove('d-none');
         } else {
             more_gpus_notification.classList.add('d-none');
         }
+        const required_memory_exceed_available = document.querySelector('.model-memory-error');
+        if(models_data && models_data.length > 0 && models_data.required_memory_exceed_available) {
+            required_memory_exceed_available.classList.remove('d-none');
+        } else {
+            required_memory_exceed_available.classList.add('d-none');
+        }
     });
 }

@@ -187,6 +195,24 @@ function render_models_assigned(models) {
 //            // models_gpus_change = true;
 //            // });
         gpus.appendChild(gpus_input);
+        const gpus_share = document.createElement("td");
+        const gpus_checkbox = document.createElement("input");
+        gpus_checkbox.setAttribute('type','checkbox');
+        gpus_checkbox.setAttribute('value',index);
+        gpus_checkbox.setAttribute('name',`share-${index}`);
+        gpus_checkbox.classList.add('form-check-input');
+        if(models_data.model_assign[index].share_gpu) {
+            gpus_checkbox.checked = true;
+        }
+        gpus_checkbox.addEventListener('change', function() {
+            if(this.checked) {
+                models_data.model_assign[index].share_gpu = true;
+            } else {
+                models_data.model_assign[index].share_gpu = false;
+            }
+            save_model_assigned();
+        });
+        gpus_share.appendChild(gpus_checkbox);
         del_button.innerHTML = ``;
         del_button.dataset.model = index;
         del_button.addEventListener('click', function() {
@@ -199,6 +225,7 @@ function render_models_assigned(models) {
         row.appendChild(model_name);
         row.appendChild(completion);
         row.appendChild(select_gpus);
+        row.appendChild(gpus_share);
         row.appendChild(del);
         models_table.appendChild(row);
     }
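The element.statuses list rendered above is attached to each GPU by _gpus(include_busy=True) in the file below, which also converts the pre-patch busy-GPU format (a plain list, one entry per card) into the new index-to-list mapping. A runnable sketch of that merge, with invented file contents:

    import json

    # Stand-ins for CONFIG_ENUM_GPUS and CONFIG_BUSY_GPUS; names and numbers are examples.
    enum_gpus = {"gpus": [
        {"name": "NVIDIA RTX A5000", "mem_total_mb": 24564},
        {"name": "NVIDIA RTX A5000", "mem_total_mb": 24564},
    ]}
    busy_gpus = {"gpus": [
        {"command": "self_hosting_machinery.inference.inference_worker --model wizardlm/7b", "status": "working"},
        {},
    ]}

    statuses = busy_gpus["gpus"]
    if isinstance(statuses, list):  # old format: one (possibly empty) entry per GPU
        statuses = {idx: [s] for idx, s in enumerate(statuses) if s}
    statuses = {int(k): v for k, v in statuses.items()}

    for idx, gpu_info in enumerate(enum_gpus["gpus"]):
        gpu_info["statuses"] = statuses.get(idx, [])

    print(json.dumps(enum_gpus, indent=4))
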
diff --git a/self_hosting_machinery/webgui/tab_models_host.py b/self_hosting_machinery/webgui/tab_models_host.py
index b38cc671..a0e0a803 100644
--- a/self_hosting_machinery/webgui/tab_models_host.py
+++ b/self_hosting_machinery/webgui/tab_models_host.py
@@ -9,11 +9,12 @@
 from known_models_db.refact_known_models import models_mini_db
 from self_hosting_machinery import env
-
 from known_models_db.refact_toolbox_db import modelcap_records
+from dataclasses import dataclass
+from dataclasses import field

 from pydantic import BaseModel
-from typing import Dict, Set
+from typing import Dict, Set, List


 __all__ = ["TabHostRouter"]

@@ -21,6 +22,7 @@
 class TabHostModelRec(BaseModel):
     gpus_shard: int = Query(default=1, ge=1, le=4)
+    share_gpu: bool = False


 class TabHostModelsAssign(BaseModel):
@@ -58,16 +60,23 @@ async def _tab_host_models_assign(self, post: TabHostModelsAssign, request: Requ

 def _gpus(include_busy: bool = False):
     if os.path.exists(env.CONFIG_ENUM_GPUS):
-        j1 = json.load(open(env.CONFIG_ENUM_GPUS, "r"))
+        result = json.load(open(env.CONFIG_ENUM_GPUS, "r"))
     else:
-        j1 = {"gpus": []}
+        result = {"gpus": []}
     if include_busy and os.path.exists(env.CONFIG_BUSY_GPUS):
-        j2 = json.load(open(env.CONFIG_BUSY_GPUS, "r"))
-        j1len = len(j1["gpus"])
-        j2len = len(j2["gpus"])
-        for i in range(min(j1len, j2len)):
-            j1["gpus"][i].update(j2["gpus"][i])
-    return j1
+        statuses = json.load(open(env.CONFIG_BUSY_GPUS, "r"))
+        if isinstance(statuses["gpus"], list):  # convert old format to new
+            statuses["gpus"] = {
+                idx: [status]
+                for idx, status in enumerate(statuses["gpus"])
+                if status
+            }
+        statuses["gpus"] = {
+            int(k): v for k, v in statuses["gpus"].items()
+        }
+        for idx, gpu_info in enumerate(result["gpus"]):
+            gpu_info["statuses"] = statuses["gpus"].get(idx, [])
+    return result


 def _model_assignment():
@@ -104,37 +113,75 @@ def _capabilities(func_type: str) -> Set:
     return {"models": models_info}


+@dataclass
+class ModelGroup:
+    model_assign: Dict[str, Dict] = field(default_factory=dict)
+
+    def required_memory_mb(self) -> int:
+        return sum(
+            models_mini_db[model_name].get("required_memory_mb", 0)
+            for model_name in self.model_assign.keys()
+        )
+
+    def gpus_shard(self) -> int:
+        if not self.model_assign:
+            return 0
+        return max([rec["gpus_shard"] for rec in self.model_assign.values()])
+
+
+def _model_assign_to_groups(model_assign: Dict[str, Dict]) -> List[ModelGroup]:
+    model_groups: List[ModelGroup] = []
+    shared_group = ModelGroup()
+    for model_name, assignment in model_assign.items():
+        if model_name not in models_mini_db.keys():
+            log(f"unknown model '{model_name}', skipping")
+            continue
+        if assignment["gpus_shard"] not in [1, 2, 4]:
+            log(f"invalid shard count {assignment['gpus_shard']}, skipping '{model_name}'")
+            continue
+        if assignment.get("share_gpu", False):
+            if not shared_group.model_assign:
+                model_groups.append(shared_group)
+            shared_group.model_assign[model_name] = assignment
+        else:
+            model_groups.append(ModelGroup({model_name: assignment}))
+    return model_groups
+
+
 def models_to_watchdog_configs(inference_config=None):
     if inference_config is None:
         inference_config = _model_assignment()
     gpus = _gpus()["gpus"]
-    model_assignment = inference_config["model_assign"]
+    model_groups = _model_assign_to_groups(inference_config["model_assign"])
     # This must work or installation is bad
     model_cfg_template = json.load(open(os.path.join(env.DIR_WATCHDOG_TEMPLATES, "model.cfg")))
     cursor = 0
     allowed_to_exist = []
+    required_memory_exceed_available = False
     more_models_than_gpus = False
-    for k, assrec in model_assignment.items():
-        if k not in models_mini_db.keys():
-            log("unknown model '%s', skipping" % k)
-            continue
-        if assrec["gpus_shard"] not in [1, 2, 4]:
-            log("invalid shard count %d, skipping %s" % (assrec["gpus_shard"], k))
-            continue
-        log("assign model '%s', cursor %d, gpus_shard %d" % (k, cursor, assrec["gpus_shard"]))
-        if cursor + assrec["gpus_shard"] > len(gpus):
+    for model_group in model_groups:
+        models_message = ' '.join([f"'{model_name}'" for model_name in model_group.model_assign.keys()])
+        log(f"assign models {models_message}, cursor {cursor}, gpus_shard {model_group.gpus_shard()}")
+        if cursor + model_group.gpus_shard() > len(gpus):
             more_models_than_gpus = True
             break
-        cfg_out = "model-%s.cfg" % k.lower().replace("/", "-")
-        allowed_to_exist.append(cfg_out)
-        with open(os.path.join(env.DIR_WATCHDOG_D, cfg_out), "w") as f:
-            model_cfg_j = copy.deepcopy(model_cfg_template)
-            model_cfg_j["command_line"].append("--model")
-            model_cfg_j["command_line"].append(k)
-            model_cfg_j["gpus"] = list(range(cursor, cursor + assrec["gpus_shard"]))
-            del model_cfg_j["unfinished"]
-            json.dump(model_cfg_j, f, indent=4)
-        cursor += assrec["gpus_shard"]
+        for model_name, assignment in model_group.model_assign.items():
+            for idx, model_cursor in enumerate(range(cursor, cursor + assignment["gpus_shard"])):
+                cfg_out = f"model-{model_name.lower().replace('/', '-')}-{idx}.cfg"
+                allowed_to_exist.append(cfg_out)
+                with open(os.path.join(env.DIR_WATCHDOG_D, cfg_out), "w") as f:
+                    model_cfg_j = copy.deepcopy(model_cfg_template)
+                    model_cfg_j["command_line"].append("--model")
+                    model_cfg_j["command_line"].append(model_name)
+                    model_cfg_j["gpus"] = list(range(model_cursor, model_cursor + assignment["gpus_shard"]))
+                    model_cfg_j["share_gpu"] = assignment.get("share_gpu", False)
+                    del model_cfg_j["unfinished"]
+                    json.dump(model_cfg_j, f, indent=4)
+        for _ in range(model_group.gpus_shard()):
+            if gpus[cursor]["mem_total_mb"] < model_group.required_memory_mb():
+                required_memory_exceed_available = True
+            cursor += 1
+    log("required_memory_exceed_available %d" % required_memory_exceed_available)
     log("more_models_than_gpus %d" % more_models_than_gpus)
     cfgs_on_disk = [cfg for cfg in os.listdir(env.DIR_WATCHDOG_D) if cfg.endswith(".cfg") and cfg.startswith("model-")]
     for cfg_fn in cfgs_on_disk:
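A standalone restatement of the grouping rule added above, runnable without the webgui package: models flagged share_gpu collapse into a single ModelGroup whose memory estimates add up, every other model gets a group of its own. The inlined mini model db and the assignment below are examples only; the real code uses models_mini_db and _model_assign_to_groups.

    from dataclasses import dataclass, field
    from typing import Dict, List

    MINI_DB = {"CONTRASTcode/3b/multi": {"required_memory_mb": 3500},
               "wizardlm/7b": {"required_memory_mb": 8000},
               "starcoder/15b/base": {"required_memory_mb": 18000}}

    @dataclass
    class ModelGroup:
        model_assign: Dict[str, Dict] = field(default_factory=dict)

        def required_memory_mb(self) -> int:
            return sum(MINI_DB[m].get("required_memory_mb", 0) for m in self.model_assign)

        def gpus_shard(self) -> int:
            return max((rec["gpus_shard"] for rec in self.model_assign.values()), default=0)

    def to_groups(model_assign: Dict[str, Dict]) -> List[ModelGroup]:
        groups: List[ModelGroup] = []
        shared = ModelGroup()
        for name, rec in model_assign.items():
            if rec.get("share_gpu", False):
                if not shared.model_assign:
                    groups.append(shared)   # the shared group is added once
                shared.model_assign[name] = rec
            else:
                groups.append(ModelGroup({name: rec}))
        return groups

    groups = to_groups({
        "CONTRASTcode/3b/multi": {"gpus_shard": 1, "share_gpu": True},
        "wizardlm/7b": {"gpus_shard": 1, "share_gpu": True},
        "starcoder/15b/base": {"gpus_shard": 2, "share_gpu": False},
    })
    for g in groups:
        print(list(g.model_assign), g.gpus_shard(), g.required_memory_mb())
    # ['CONTRASTcode/3b/multi', 'wizardlm/7b'] 1 11500
    # ['starcoder/15b/base'] 2 18000
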
@@ -168,6 +215,7 @@ def models_to_watchdog_configs(inference_config=None):

     with open(env.CONFIG_INFERENCE, "w") as f:
         json.dump({
+            "required_memory_exceed_available": required_memory_exceed_available,
             "more_models_than_gpus": more_models_than_gpus,
             **inference_config,
         }, f, indent=4)
@@ -178,6 +226,7 @@ def first_run():
         "model_assign": {
             "CONTRASTcode/3b/multi": {
                 'gpus_shard': 1,
+                'share_gpu': False,
             }
         },
         "completion": "CONTRASTcode/3b/multi",
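For reference, a sketch of the watchdog config that the per-shard loop above would emit for the first_run() defaults. Only the keys visible in the model.cfg hunk are reproduced here; the real template may contain additional fields.

    import copy
    import json

    model_cfg_template = {
        "command_line": ["python", "-m", "self_hosting_machinery.inference.inference_worker"],
        "unfinished": True,
        "needs_compile": True,
        "gpus": [],
        "share_gpu": False,
    }

    model_name = "CONTRASTcode/3b/multi"
    assignment = {"gpus_shard": 1, "share_gpu": False}
    cursor, idx = 0, 0

    cfg_out = f"model-{model_name.lower().replace('/', '-')}-{idx}.cfg"  # model-contrastcode-3b-multi-0.cfg
    model_cfg_j = copy.deepcopy(model_cfg_template)
    model_cfg_j["command_line"] += ["--model", model_name]
    model_cfg_j["gpus"] = list(range(cursor, cursor + assignment["gpus_shard"]))
    model_cfg_j["share_gpu"] = assignment.get("share_gpu", False)
    del model_cfg_j["unfinished"]

    print(cfg_out)
    print(json.dumps(model_cfg_j, indent=4))
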