Finetune and models fixes (#85)
* model finetune: activate new format

* typos, msgs, panel block

* lora mode off by default, best lora script args

* add missing requirements

* fix weird refact1.6 chat behaviour

* add system prompt to help handle out-of-distribution answers

* added top3-like logs (no top3 tokens though)

* add missing import

* 1.6b chat prompt, strip() in messages

* ScratchpadHuggingface -> ScratchpadFIM, remove copy-paste

* fim spad refactoring

* tokenizer for one token refactoring

* temporary disable finetune property for refact model

* ignoring the `system prompt` in some cases for refact1.6b model

* fix required modules

---------

Co-authored-by: oxyplay <max@oxyplay.com>
Co-authored-by: JegernOUTT <sergey.vakhreev@gmail.com>
Co-authored-by: Oleg Klimov <omgtech@gmail.com>
4 people authored Sep 2, 2023
1 parent 228cb97 commit 7431a01
Showing 14 changed files with 130 additions and 139 deletions.
6 changes: 3 additions & 3 deletions known_models_db/refact_known_models/huggingface.py
@@ -2,7 +2,7 @@
     "starcoder/15b/base": {
         "backend": "autogptq",
         "model_path": "TheBloke/starcoder-GPTQ",
-        "diff_scratchpad_class": "refact_scratchpads:ScratchpadHuggingface",
+        "diff_scratchpad_class": "refact_scratchpads:ScratchpadPSM",
         "chat_scratchpad_class": None,
         "model_class_kwargs": {},
         "required_memory_mb": 18000,
@@ -11,7 +11,7 @@
     "starcoder/15b/plus": {
         "backend": "autogptq",
         "model_path": "TheBloke/starcoderplus-GPTQ",
-        "diff_scratchpad_class": "refact_scratchpads:ScratchpadHuggingface",
+        "diff_scratchpad_class": "refact_scratchpads:ScratchpadPSM",
         "chat_scratchpad_class": None,
         "model_class_kwargs": {},
         "required_memory_mb": 18000,
@@ -29,7 +29,7 @@
     "wizardcoder/15b": {
         "backend": "autogptq",
        "model_path": "TheBloke/WizardCoder-15B-1.0-GPTQ",
-        "diff_scratchpad_class": "refact_scratchpads:ScratchpadHuggingface",
+        "diff_scratchpad_class": "refact_scratchpads:ScratchpadPSM",
         "chat_scratchpad_class": None,
         "model_class_kwargs": {},
         "required_memory_mb": 18000,
4 changes: 2 additions & 2 deletions known_models_db/refact_known_models/refact.py
@@ -2,11 +2,11 @@
     "Refact/1.6B": {
         "backend": "transformers",
         "model_path": "smallcloudai/Refact-1_6B-fim",
-        "diff_scratchpad_class": "refact_scratchpads:ScratchpadHuggingface",
+        "diff_scratchpad_class": "refact_scratchpads:ScratchpadSPM",
         "chat_scratchpad_class": "refact_scratchpads:ScratchpadHuggingfaceRefact",
         "model_class_kwargs": {},
         "required_memory_mb": 6000,
-        "filter_caps": ["Refact", "completion", "finetune"],
+        "filter_caps": ["Refact", "completion"],
     },

     "CONTRASTcode/medium/multi": {
3 changes: 2 additions & 1 deletion refact_scratchpads/__init__.py
@@ -4,7 +4,8 @@

 from refact_scratchpads.scratchpad_hf import ScratchpadHuggingfaceBase
 from refact_scratchpads.scratchpad_hf import ScratchpadHuggingfaceCompletion
-from refact_scratchpads.scratchpad_hf import ScratchpadHuggingface
+from refact_scratchpads.scratchpad_hf import ScratchpadSPM
+from refact_scratchpads.scratchpad_hf import ScratchpadPSM
 from refact_scratchpads.scratchpad_hf import ScratchpadCodeLlama
 from refact_scratchpads.scratchpad_hf import ScratchpadHuggingfaceStarChat
 from refact_scratchpads.scratchpad_hf import ScratchpadHuggingfaceWizard
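Note: the two newly exported scratchpads differ only in the order of the fill-in-the-middle sections; the classes themselves appear in the refact_scratchpads/scratchpad_hf.py diff below. A minimal sketch of the two prompt layouts (the sample code and string rendering are illustrative assumptions; the real scratchpads assemble lists of token ids via _prompt_format()):

# Illustrative sketch only: PSM vs SPM prompt order, rendered as strings.
prefix = "def add(a, b):\n    return "  # hypothetical text left of the cursor
suffix = "\n\nprint(add(1, 2))\n"       # hypothetical text right of the cursor

# PSM (prefix-suffix-middle), now used by the StarCoder/WizardCoder entries:
psm = "<fim_prefix>" + prefix + "<fim_suffix>" + suffix + "<fim_middle>"

# SPM (suffix-prefix-middle), now used by the Refact/1.6B entry:
spm = "<fim_suffix>" + suffix + "<fim_prefix>" + prefix + "<fim_middle>"

# In both layouts the model generates the missing middle (here "a + b")
# after the <fim_middle> sentinel.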
152 changes: 42 additions & 110 deletions refact_scratchpads/scratchpad_hf.py
@@ -1,7 +1,5 @@
 import torch as th
 import time
-import json
-import os
 import termcolor

 from refact_scratchpads.scratchpad_utils import trim_context_infill
@@ -33,7 +31,6 @@ def __init__(
             **unused
     ):
         self._tokenizer = tokenizer
-        self._tokenizer_skip_first = bool(tokenizer.encode(""))  # XXX: replace with add_special_tokens=False ?
         self._max_tokens = max_tokens
         self._logger = logger
         self._created = created
@@ -96,14 +93,12 @@ def after_token_selection(self, m, chosen_token: th.Tensor, **unused) -> Dict[str, Any]:
         return dict()

     def _encode_one_token(self, text: str) -> int:
-        tokens = self._tokenizer.encode(text)
-        if self._tokenizer_skip_first:
-            tokens = tokens[1:]
+        tokens = self._tokenizer.encode(text, add_special_tokens=False)
         if len(tokens) != 1:
             raise ValueError(f"Must be single token, have {tokens} for '{text}'")
         return tokens[0]

-    def encode_without_special_tokens(self, txt: str) -> List[int]:
+    def _encode_without_special_tokens(self, txt: str) -> List[int]:
         if hasattr(self._tokenizer, "tokenizer_copy_but_does_not_encode_special_tokens"):
             t = self._tokenizer.tokenizer_copy_but_does_not_encode_special_tokens
         else:
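The reworked _encode_one_token() above relies on the tokenizer mapping each FIM sentinel to exactly one id once special-token insertion is disabled. A hedged sketch of that invariant (the model path is taken from this commit's refact.py; any FIM-trained tokenizer would do):

# Sketch, assuming a HuggingFace tokenizer with FIM sentinels in its vocab.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("smallcloudai/Refact-1_6B-fim")

# add_special_tokens=False stops the tokenizer from prepending BOS-like
# tokens, so a sentinel such as "<fim_prefix>" must map to a single id.
tokens = tokenizer.encode("<fim_prefix>", add_special_tokens=False)
assert len(tokens) == 1, f"expected a single token, got {tokens}"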
@@ -141,15 +136,14 @@ def completion(self, final: bool):
         return {"text": self._tokenizer.decode(self._completion)}


-class ScratchpadHuggingface(ScratchpadHuggingfaceBase):
+class ScratchpadFIM(ScratchpadHuggingfaceBase):

     def __init__(
             self,
             sources: Dict[str, str],
             cursor_file: str,
             cursor0: int,
             cursor1: int,
-            ignore_special_tokens: bool = True,
             **kwargs
     ):
         super().__init__(**kwargs)
@@ -158,7 +152,6 @@ def __init__(

         self._cursor_file = cursor_file
         self._cursor = cursor0
-        self._ignore_special_tokens = ignore_special_tokens
         self._code = sources[cursor_file]

         self._prefix: Optional[str] = None
@@ -171,6 +164,9 @@ def __init__(
         self._fim_suffix = self._encode_one_token("<fim_suffix>")
         self._fim_middle = self._encode_one_token("<fim_middle>")

+    def _prompt_format(self, prefix_tokens, suffix_tokens):
+        raise NotImplementedError()
+
     def prompt(self, T: int):
         self._prefix = self._code[:self._cursor]
         # Why we need to cut the line right of the cursor?
@@ -182,34 +178,23 @@ def prompt(self, T: int):
         #    ^^ but we stop here because we need single line completion
         #    => we have two closing parenthesis.
         # self._suffix = "".join(self._code[self._cursor:].splitlines(keepends=True)[1:])
-        self._suffix = self._code[self._cursor:]
+        self._suffix = self._code[self._cursor:].lstrip(" \t")
         self._suffix_line0cut = "".join(self._code[self._cursor:].splitlines(keepends=True)[1:])
         self._completion.clear()

         prefix_cut, suffix_cut = trim_context_infill(
             self._prefix, self._suffix, EncodingWrapper(self._tokenizer), T - self._max_tokens
         )
+        prefix_cut_tokens = self._encode_without_special_tokens(prefix_cut)
+        suffix_cut_tokens = self._encode_without_special_tokens(suffix_cut)
         self.debuglog(
-            f"ScratchpadHuggingfaceFIM prompt prefix {len(prefix_cut)} chars, "
-            f"suffix {len(suffix_cut)} chars, T={T} max_tokens={self._max_tokens}"
+            "ScratchpadFIM prompt prefix %d chars -> %d tokens, suffix %d chars -> %d tokens, T=%d max_new_tokens=%d" %
+            (len(prefix_cut), len(prefix_cut_tokens), len(suffix_cut), len(suffix_cut_tokens), T, self._max_tokens)
         )
-        if self._ignore_special_tokens:
-            prefix_cut_tokens = self.encode_without_special_tokens(prefix_cut)
-            suffix_cut_tokens = self.encode_without_special_tokens(suffix_cut)
-        else:
-            prefix_cut_tokens = self._tokenizer.encode(prefix_cut)
-            suffix_cut_tokens = self._tokenizer.encode(suffix_cut)

-        prompt: List[int] = [
-            self._fim_prefix,
-            *prefix_cut_tokens,
-            self._fim_suffix,
-            *suffix_cut_tokens,
-            self._fim_middle,
-        ]
-        # self.debuglog("-"*40)
-        # self.debuglog(self._tokenizer.decode(prompt))
-        # self.debuglog("-"*40)
+        prompt: List[int] = self._prompt_format(prefix_cut_tokens, suffix_cut_tokens)
+        self.debuglog("-"*40)
+        self.debuglog(self._tokenizer.decode(prompt))
+        self.debuglog("-"*40)
         return prompt

     def completion(self, final: bool):
@@ -224,88 +209,28 @@
         return {self._cursor_file: self._prefix + completion + self._suffix_line0cut}


-class ScratchpadRefactFIM(ScratchpadHuggingfaceBase):
-
-    def __init__(
-            self,
-            sources: Dict[str, str],
-            cursor_file: str,
-            cursor0: int,
-            cursor1: int,
-            ignore_special_tokens: bool = True,
-            **kwargs
-    ):
-        super().__init__(**kwargs)
-
-        assert cursor0 == cursor1
-
-        self._cursor_file = cursor_file
-        self._cursor = cursor0
-        self._ignore_special_tokens = ignore_special_tokens
-        self._code = sources[cursor_file]
-
-        self._prefix: Optional[str] = None
-        self._suffix: Optional[str] = None
-        self._suffix_line0cut: Optional[str] = None
-        self._completion = []
-
-        self._tokens_produced = 0
-        self._fim_prefix = self._encode_one_token("<fim_prefix>")
-        self._fim_suffix = self._encode_one_token("<fim_suffix>")
-        self._fim_middle = self._encode_one_token("<fim_middle>")
-
-    def prompt(self, T: int):
-        self._prefix = self._code[:self._cursor]
-        # Why we need to cut the line right of the cursor?
-        # Example 1:
-        #    function_call(param1, GENERATED_TONENS<EOF>)
-        #    => everything works right
-        # Example 2:
-        #    function_call(param1, GENERATED_TONENS)\nMORE_TOKENS\nSOME_OTHER_CALL(OTHER_PARAM<EOF>)
-        #    ^^ but we stop here because we need single line completion
-        #    => we have two closing parenthesis.
-        # self._suffix = "".join(self._code[self._cursor:].splitlines(keepends=True)[1:])
-        self._suffix = self._code[self._cursor:]
-        self._suffix_line0cut = "".join(self._code[self._cursor:].splitlines(keepends=True)[1:])
-        self._completion.clear()
-
-        prefix_cut, suffix_cut = trim_context_infill(
-            self._prefix, self._suffix, EncodingWrapper(self._tokenizer), T - self._max_tokens
-        )
-        self.debuglog(
-            f"ScratchpadRefactFIM prompt prefix {len(prefix_cut)} chars, "
-            f"suffix {len(suffix_cut)} chars, T={T} max_tokens={self._max_tokens}"
-        )
-        if self._ignore_special_tokens:
-            prefix_cut_tokens = self.encode_without_special_tokens(prefix_cut)
-            suffix_cut_tokens = self.encode_without_special_tokens(suffix_cut)
-        else:
-            prefix_cut_tokens = self._tokenizer.encode(prefix_cut)
-            suffix_cut_tokens = self._tokenizer.encode(suffix_cut)
+class ScratchpadSPM(ScratchpadFIM):

-        prompt: List[int] = [
+    def _prompt_format(self, prefix_tokens, suffix_tokens):
+        return [
             self._fim_suffix,
-            *suffix_cut_tokens,
+            *suffix_tokens,
             self._fim_prefix,
-            *prefix_cut_tokens,
+            *prefix_tokens,
             self._fim_middle,
         ]
-        # self.debuglog("-"*40)
-        # self.debuglog(self._tokenizer.decode(prompt))
-        # self.debuglog("-"*40)
-        return prompt

-    def completion(self, final: bool):
-        assert self._prefix is not None
-        assert self._suffix is not None
-        completion = self._tokenizer.decode(self._completion)
-        if self.finish_reason == "eot":
-            # Correct stop
-            return {self._cursor_file: self._prefix + completion + self._suffix}
-        else:
-            # "stop-lf" or "length" or not stopped yet (empty reason), it's better to remove first line remainder
-            return {self._cursor_file: self._prefix + completion + self._suffix_line0cut}
+
+class ScratchpadPSM(ScratchpadFIM):
+
+    def _prompt_format(self, prefix_tokens, suffix_tokens):
+        return [
+            self._fim_prefix,
+            *prefix_tokens,
+            self._fim_suffix,
+            *suffix_tokens,
+            self._fim_middle,
+        ]


 class ScratchpadCodeLlama(ScratchpadHuggingfaceBase):
@@ -454,14 +379,21 @@ def __init__(self, *args, **kwargs):
         self._esc = "<empty_output>"

     def _prompt(self) -> str:
-        text = ""
+        if len(self._messages) <= 2:
+            text = self._esc + ("SYSTEM You are a programming assistant. "
+                                "If you don't understand the question, just say: "
+                                "I don't understand the question.\n")
+        else:
+            # We are ignoring the `system prompt` here 'cause the model
+            # hasn't seen more than two messages with a `system prompt` while training
+            # Going to fix this later with the next iteration
+            text = ""
         for message in self._messages:
             if message["content"] == "":
                 continue
             if message["role"] == "user":
-                text += f"{self._esc}USER "
+                text += self._esc + "USER " + message["content"].strip() + "\n"
             else:
-                text += f"{self._esc}ASSISTANT "
-            text += message["content"] + "\n"
-        text += f"{self._esc}ASSISTANT "
+                text += self._esc + "ASSISTANT " + message["content"].strip() + "\n"
+        text += self._esc + "ASSISTANT"
         return text
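To make the new chat format concrete: for a fresh conversation (a single user message plus the pending assistant turn), _prompt() now renders text like the sketch below; the user message is made up for illustration.

# Hypothetical rendering of the Refact-1.6B chat prompt after this change.
esc = "<empty_output>"
prompt = (
    esc + "SYSTEM You are a programming assistant. "
          "If you don't understand the question, just say: "
          "I don't understand the question.\n"
    + esc + "USER how do I reverse a list in python?\n"
    + esc + "ASSISTANT"
)
# Generation continues right after the trailing "ASSISTANT" marker.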
16 changes: 13 additions & 3 deletions refact_scratchpads_no_gpu/stream_results.py
@@ -1,7 +1,18 @@
-import os, sys, json, re, time, datetime, termcolor, multiprocessing, copy, queue
+import os
+import json
+import re
+import time
+import datetime
+import termcolor
+import multiprocessing
+import copy
+import queue
 import requests
-from typing import Dict, Any, List, Optional, Set
+import setproctitle
 import logging

+from typing import Dict, Any, List, Optional, Set
+
 logger = logging.getLogger("INFSERVER")

@@ -241,7 +252,6 @@ def check_cancelled(self):


 def _upload_results_loop(upload_q: multiprocessing.Queue, cancelled_q: multiprocessing.Queue):
-    import setproctitle
     setproctitle.setproctitle("upload_results_loop")
     req_session = infserver_session()
     exit_flag = False
2 changes: 2 additions & 0 deletions self_hosting_machinery/inference/inference_base.py
@@ -9,6 +9,8 @@ def modload(import_str):
     import_mod, import_class = import_str.rsplit(":", 1)
     model = importlib.import_module(import_mod)
     Class = getattr(model, import_class, None)
+    if Class is None:
+        raise ValueError("cannot find \"%s\"" % import_str)
     return Class

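The added None check turns a misspelled scratchpad path into an immediate error instead of a confusing failure later. A small usage sketch of modload() with the "module:Class" strings from the known-models db (the import path is assumed from this file's location in the diff):

# Sketch: resolving scratchpad classes the way the inference code does.
from self_hosting_machinery.inference.inference_base import modload

ScratchpadPSM = modload("refact_scratchpads:ScratchpadPSM")  # returns the class

# A typo now raises ValueError('cannot find "refact_scratchpads:Nope"')
# instead of silently returning None.
modload("refact_scratchpads:Nope")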
[Diffs for the remaining 8 changed files are not shown.]