From 79e152dad29770fcde2d61402ab6d3de417cedf8 Mon Sep 17 00:00:00 2001
From: Oleg Klimov
Date: Thu, 31 Aug 2023 12:10:48 +0200
Subject: [PATCH] Add 1.6b, fix some other models (#82)

* update known models list
* add required model memory and auto torch dtype for huggingface models
* remove system prompt from refact chat scratchpad
* Refact/1.6B-fim
* fix model regex
* Refact/1.6B name
* context size

---------

Co-authored-by: mitya
---
 .../refact_known_models/huggingface.py        | 16 ++++------------
 known_models_db/refact_known_models/refact.py | 20 ++++++++++----------
 refact_scratchpads/scratchpad_hf.py           | 12 +++++-------
 .../inference/inference_hf.py                 | 10 +++++++---
 4 files changed, 26 insertions(+), 32 deletions(-)

diff --git a/known_models_db/refact_known_models/huggingface.py b/known_models_db/refact_known_models/huggingface.py
index 1bbd6912..3fa5e90e 100644
--- a/known_models_db/refact_known_models/huggingface.py
+++ b/known_models_db/refact_known_models/huggingface.py
@@ -40,9 +40,7 @@
         "model_path": "TheBloke/WizardLM-7B-V1.0-Uncensored-GPTQ",
         "diff_scratchpad_class": None,
         "chat_scratchpad_class": "refact_scratchpads:ScratchpadHuggingfaceWizard",
-        "model_class_kwargs": {
-            "model_basename": "wizardlm-7b-v1.0-uncensored-GPTQ-4bit-128g.no-act.order",
-        },
+        "model_class_kwargs": {},
         "required_memory_mb": 8000,
         "filter_caps": ["wizardlm"],
     },
@@ -51,9 +49,7 @@
         "model_path": "TheBloke/WizardLM-13B-V1.1-GPTQ",
         "diff_scratchpad_class": None,
         "chat_scratchpad_class": "refact_scratchpads:ScratchpadHuggingfaceWizard",
-        "model_class_kwargs": {
-            "model_basename": "wizardlm-13b-v1.1-GPTQ-4bit-128g.no-act.order",
-        },
+        "model_class_kwargs": {},
         "required_memory_mb": 14000,
         "filter_caps": ["wizardlm"],
     },
@@ -62,9 +58,7 @@
         "model_path": "TheBloke/Llama-2-7b-Chat-GPTQ",
         "diff_scratchpad_class": None,
         "chat_scratchpad_class": "refact_scratchpads:ScratchpadHuggingfaceLlama2",
-        "model_class_kwargs": {
-            "model_basename": "gptq_model-4bit-128g",
-        },
+        "model_class_kwargs": {},
         "required_memory_mb": 8000,
         "filter_caps": ["llama2"],
     },
@@ -73,9 +67,7 @@
         "model_path": "TheBloke/Llama-2-13B-chat-GPTQ",
         "diff_scratchpad_class": None,
         "chat_scratchpad_class": "refact_scratchpads:ScratchpadHuggingfaceLlama2",
-        "model_class_kwargs": {
-            "model_basename": "gptq_model-4bit-128g",
-        },
+        "model_class_kwargs": {},
         "required_memory_mb": 14000,
         "filter_caps": ["llama2"],
     },
diff --git a/known_models_db/refact_known_models/refact.py b/known_models_db/refact_known_models/refact.py
index 32cbc7e3..35bd2a12 100644
--- a/known_models_db/refact_known_models/refact.py
+++ b/known_models_db/refact_known_models/refact.py
@@ -1,4 +1,14 @@
 refact_mini_db = {
+    "Refact/1.6B": {
+        "backend": "transformers",
+        "model_path": "smallcloudai/Refact-1_6B-fim",
+        "diff_scratchpad_class": "refact_scratchpads:ScratchpadHuggingface",
+        "chat_scratchpad_class": "refact_scratchpads:ScratchpadHuggingfaceRefact",
+        "model_class_kwargs": {},
+        "required_memory_mb": 6000,
+        "filter_caps": ["Refact", "completion"],
+    },
+
     "CONTRASTcode/medium/multi": {
         "model_path_type": "huggingface",
         "model_path": "smallcloudai/codify_medium_multi",
@@ -20,14 +30,4 @@
         "required_memory_mb": 8500,
         "filter_caps": ["CONTRASTcode", "completion", "finetune"],
     },
-
-    "Refact/2b": {
-        "backend": "transformers",
-        "model_path": "smallcloudai/Refact-2b",
-        "diff_scratchpad_class": "refact_scratchpads:ScratchpadHuggingface",
-        "chat_scratchpad_class": "refact_scratchpads:ScratchpadHuggingfaceRefact",
-        "model_class_kwargs": {},
-        "filter_caps": ["Refact", "completion"],
-        "hidden": True,  # only for debugging because model is still training
-    },
 }
diff --git a/refact_scratchpads/scratchpad_hf.py b/refact_scratchpads/scratchpad_hf.py
index 085660e2..470565b4 100644
--- a/refact_scratchpads/scratchpad_hf.py
+++ b/refact_scratchpads/scratchpad_hf.py
@@ -451,19 +451,17 @@ class ScratchpadHuggingfaceRefact(ScratchpadChatBase):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self._esc_token = self._encode_one_token("<empty_output>")
+        self._esc = "<empty_output>"

     def _prompt(self) -> str:
-        esc = self._tokenizer.decode(self._esc_token)
-        system_prompt = "You are a chat bot"
-        text = f"{esc}SYSTEM {system_prompt}\n"
+        text = ""
         for message in self._messages:
             if message["content"] == "":
                 continue
             if message["role"] == "user":
-                text += f"{esc}USER "
+                text += f"{self._esc}USER "
             else:
-                text += f"{esc}ASSISTANT "
+                text += f"{self._esc}ASSISTANT "
             text += message["content"] + "\n"
-        text += f"{esc}ASSISTANT "
+        text += f"{self._esc}ASSISTANT "
         return text
diff --git a/self_hosting_machinery/inference/inference_hf.py b/self_hosting_machinery/inference/inference_hf.py
index 25502709..d11ffca2 100644
--- a/self_hosting_machinery/inference/inference_hf.py
+++ b/self_hosting_machinery/inference/inference_hf.py
@@ -128,8 +128,9 @@ def __init__(self,

         if model_dict["backend"] == "transformers":
             self._model = AutoModelForCausalLM.from_pretrained(
-                self._model_dict["model_path"], cache_dir=env.DIR_WEIGHTS, device_map="auto",
-                trust_remote_code=True, **self._model_dict["model_class_kwargs"])
+                self._model_dict["model_path"], cache_dir=env.DIR_WEIGHTS,
+                device_map="auto", torch_dtype="auto", trust_remote_code=True,
+                **self._model_dict["model_class_kwargs"])
         elif model_dict["backend"] == "autogptq":
             self._model = CustomAutoGPTQForCausalLM.from_quantized(
                 self._model_dict["model_path"], cache_dir=env.DIR_WEIGHTS, device=self._device,
@@ -154,7 +155,10 @@ def logger(*args):
             Scratchpad = ScratchpadHuggingfaceCompletion

         scratchpad = Scratchpad(tokenizer=self._tokenizer, logger=logger, **request)
-        p = scratchpad.prompt(self._tokenizer.max_len_single_sentence)
+        T = self._tokenizer.max_len_single_sentence
+        if not isinstance(T, int) or T <= 0 or T > 4096:
+            T = 2048
+        p = scratchpad.prompt(T)
         if len(p) == 0:
             raise RuntimeError("empty tokens prompt")