Llamacpp backend #81

Merged
merged 32 commits into clp-research:main from llamacpp_backend on Apr 25, 2024

Commits (32)
4e77aeb
Initial llama.cpp backend prototype
Gnurro Apr 11, 2024
83e8b1e
Merge branch 'clp-research:main' into llamacpp_backend
Gnurro Apr 11, 2024
f357868
Add backends_util.py containing utility functions used by multiple ba…
Gnurro Apr 12, 2024
9072472
Merge remote-tracking branch 'origin/llamacpp_backend' into llamacpp_…
Gnurro Apr 12, 2024
7e6d47c
Cleanup; Add note about sampling difference
Gnurro Apr 12, 2024
0b99346
Merge branch 'clp-research:main' into llamacpp_backend
Gnurro Apr 12, 2024
af36a79
Add note
Gnurro Apr 12, 2024
20a8f73
GPU offloading hardcoded for now
Gnurro Apr 15, 2024
6e07d41
Using default sampling parameter values for those not handled by clem…
Gnurro Apr 15, 2024
03ae104
Add generic context limit check function to backends/util.py; Retriev…
Gnurro Apr 15, 2024
fc8d67d
Add two models used for local testing to registry; Code cleanup
Gnurro Apr 15, 2024
b206ae8
Remove backends_util.py, context limit function now in backends/util.py
Gnurro Apr 15, 2024
4c09475
Add model entries to registry; Code improvements
Gnurro Apr 16, 2024
e2c38fa
Add handling of optional model loading flags for CPU/GPU usage and GP…
Gnurro Apr 17, 2024
49138c0
Add openchat_3.5-GGUF-q5 to model registry
Gnurro Apr 17, 2024
cc12737
Add WIP llama.cpp backend howto
Gnurro Apr 17, 2024
dc53488
Merge branch 'clp-research:main' into llamacpp_backend
Gnurro Apr 18, 2024
17ba773
Complete readmes
Gnurro Apr 18, 2024
1e3741c
Added advanced model entry key info to model_backend_registry_readme.md
Gnurro Apr 18, 2024
8b18c99
Remove requirements_llama_cpp.txt to prevent installation issues
Gnurro Apr 18, 2024
ed34ff5
Refactoring
Gnurro Apr 19, 2024
86afab4
Change to model.__call__ for generation
Gnurro Apr 19, 2024
4ebc0dd
Cleanup
Gnurro Apr 22, 2024
27a68eb
Merge branch 'clp-research:main' into llamacpp_backend
Gnurro Apr 22, 2024
b3e5c23
Move raising of context limit exception to utils.py
Gnurro Apr 23, 2024
751fdc1
Merge remote-tracking branch 'origin/llamacpp_backend' into llamacpp_…
Gnurro Apr 23, 2024
438e56c
Merge branch 'clp-research:main' into llamacpp_backend
Gnurro Apr 23, 2024
e64047b
Properly initialize Llama model with context size from model file (n_…
Gnurro Apr 23, 2024
8122678
Merge remote-tracking branch 'origin/llamacpp_backend' into llamacpp_…
Gnurro Apr 23, 2024
e6198a6
Update model registry readme: filename is case-sensitive (on Ubuntu)
Gnurro Apr 24, 2024
9255eff
Cleanup
Gnurro Apr 24, 2024
be1a12d
Merge branch 'clp-research:main' into llamacpp_backend
Gnurro Apr 24, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -192,3 +192,4 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
/venv_hf/
/venv_llamacpp/
198 changes: 198 additions & 0 deletions backends/llamacpp_api.py
@@ -0,0 +1,198 @@
"""
Backend using llama.cpp for GGUF/GGML models.
"""

from typing import List, Dict, Tuple, Any

import backends
from backends.utils import check_context_limit_generic

import llama_cpp
from llama_cpp import Llama

logger = backends.get_logger(__name__)


def load_model(model_spec: backends.ModelSpec) -> Any:
    """
    Load GGUF/GGML model weights from HuggingFace, into VRAM if available. Weights are distributed over all available
    GPUs for maximum speed - make sure to limit the available GPUs using environment variables if only a subset is to
    be used.
    :param model_spec: The ModelSpec for the model.
    :return: The llama_cpp model class instance of the loaded model.
    """
    logger.info(f'Start loading llama.cpp model weights from HuggingFace: {model_spec.model_name}')

    hf_repo_id = model_spec['huggingface_id']
    hf_model_file = model_spec['filename']

    # default to GPU offload:
    gpu_layers_offloaded = -1  # -1 = offload all model layers to GPU
    # check for optional execute_on flag:
    if hasattr(model_spec, 'execute_on'):
        if model_spec.execute_on == "gpu":
            gpu_layers_offloaded = -1
        elif model_spec.execute_on == "cpu":
            gpu_layers_offloaded = 0
    # check for optional gpu_layers_offloaded value:
    elif hasattr(model_spec, 'gpu_layers_offloaded'):
        gpu_layers_offloaded = model_spec.gpu_layers_offloaded

    if 'requires_api_key' in model_spec and model_spec['requires_api_key']:
        # load HF API key:
        creds = backends.load_credentials("huggingface")
        api_key = creds["huggingface"]["api_key"]
        model = Llama.from_pretrained(hf_repo_id, hf_model_file, token=api_key, verbose=False,
                                      n_gpu_layers=gpu_layers_offloaded, n_ctx=0)
    else:
        model = Llama.from_pretrained(hf_repo_id, hf_model_file, verbose=False, n_gpu_layers=gpu_layers_offloaded,
                                      n_ctx=0)

    logger.info(f"Finished loading llama.cpp model: {model_spec.model_name}")

    return model


def get_chat_formatter(model: Llama, model_spec: backends.ModelSpec) -> llama_cpp.llama_chat_format.Jinja2ChatFormatter:
    # placeholders for BOS/EOS:
    bos_string = None
    eos_string = None

    # check chat template:
    if model_spec.premade_chat_template:
        # jinja chat template available in metadata
        chat_template = model.metadata['tokenizer.chat_template']
    else:
        chat_template = model_spec.custom_chat_template

    if hasattr(model, 'chat_format'):
        if not model.chat_format:
            # no guessed chat format
            pass
        else:
            if model.chat_format == "chatml":
                # get BOS/EOS strings for chatml from llama.cpp:
                bos_string = llama_cpp.llama_chat_format.CHATML_BOS_TOKEN
                eos_string = llama_cpp.llama_chat_format.CHATML_EOS_TOKEN
            elif model.chat_format == "mistral-instruct":
                # get BOS/EOS strings for mistral-instruct from llama.cpp:
                bos_string = llama_cpp.llama_chat_format.MISTRAL_INSTRUCT_BOS_TOKEN
                eos_string = llama_cpp.llama_chat_format.MISTRAL_INSTRUCT_EOS_TOKEN

    # get BOS/EOS token string from model file:
    # NOTE: These may not be the expected tokens, checking these when a model is added is likely necessary!
    if "tokenizer.ggml.bos_token_id" in model.metadata:
        bos_string = model._model.token_get_text(int(model.metadata.get("tokenizer.ggml.bos_token_id")))
    if "tokenizer.ggml.eos_token_id" in model.metadata:
        eos_string = model._model.token_get_text(int(model.metadata.get("tokenizer.ggml.eos_token_id")))

    # get BOS/EOS strings for template from registry if not available from model file:
    if not bos_string:
        bos_string = model_spec.bos_string
    if not eos_string:
        eos_string = model_spec.eos_string

    # init llama-cpp-python jinja chat formatter:
    chat_formatter = llama_cpp.llama_chat_format.Jinja2ChatFormatter(
        template=chat_template,
        bos_token=bos_string,
        eos_token=eos_string
    )

    return chat_formatter


class LlamaCPPLocal(backends.Backend):
    """
    Model/backend handler class for locally-run GGUF/GGML models.
    """
    def __init__(self):
        super().__init__()

    def get_model_for(self, model_spec: backends.ModelSpec) -> backends.Model:
        """
        Get a LlamaCPPLocalModel instance with the passed model and settings. Will load all required data for using
        the model upon initialization.
        :param model_spec: The ModelSpec for the model.
        :return: The Model class instance of the model.
        """
        return LlamaCPPLocalModel(model_spec)


class LlamaCPPLocalModel(backends.Model):
    """
    Class for loaded llama.cpp models ready for generation.
    """
    def __init__(self, model_spec: backends.ModelSpec):
        super().__init__(model_spec)
        self.model = load_model(model_spec)

        self.chat_formatter = get_chat_formatter(self.model, model_spec)

        if hasattr(self.model, 'chat_handler'):
            if not self.model.chat_handler:
                # no custom chat handler
                pass
            else:
                # specific chat handlers may be needed for multimodal models
                # see https://llama-cpp-python.readthedocs.io/en/latest/#multi-modal-models
                pass

        # get context size from model instance:
        self.context_size = self.model._n_ctx

    def generate_response(self, messages: List[Dict], return_full_text: bool = False) -> Tuple[Any, Any, str]:
        """
        :param messages: for example
                [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": "Who won the world series in 2020?"},
                    {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
                    {"role": "user", "content": "Where was it played?"}
                ]
        :param return_full_text: If True, whole input context is returned.
        :return: the continuation
        """
        # use llama.cpp jinja to apply chat template for prompt:
        prompt_text = self.chat_formatter(messages=messages).prompt

        prompt = {"inputs": prompt_text, "max_new_tokens": self.get_max_tokens(),
                  "temperature": self.get_temperature(), "return_full_text": return_full_text}

        prompt_tokens = self.model.tokenize(prompt_text.encode(), add_bos=False)  # BOS expected in template

        # check context limit:
        check_context_limit_generic(self.context_size, prompt_tokens, self.model_spec.model_name,
                                    max_new_tokens=self.get_max_tokens())

        # NOTE: HF transformers models come with their own generation configs, but llama.cpp doesn't seem to have a
        # feature like that. There are default sampling parameters, and clembench only handles two of them so far,
        # which are set accordingly. Other parameters use the llama-cpp-python default values for now.

        # NOTE: llama.cpp has a set sampling order, which differs from that of HF transformers. The latter allows
        # individual sampling orders defined in the generation config that comes with HF models.

        model_output = self.model(
            prompt_text,
            temperature=self.get_temperature(),
            max_tokens=self.get_max_tokens()
        )

        response = {'response': model_output}

        # cull input context:
        if not return_full_text:
            response_text = model_output['choices'][0]['text'].strip()

            if 'output_split_prefix' in self.model_spec:
                response_text = response_text.rsplit(self.model_spec['output_split_prefix'], maxsplit=1)[1]

            eos_len = len(self.model_spec['eos_to_cull'])

            if response_text.endswith(self.model_spec['eos_to_cull']):
                response_text = response_text[:-eos_len]

        else:
            response_text = prompt_text + model_output['choices'][0]['text'].strip()

        return prompt, response, response_text
73 changes: 73 additions & 0 deletions backends/model_registry.json
@@ -462,5 +462,78 @@
"huggingface_id": "google/gemma-7b-it",
"premade_chat_template": true,
"eos_to_cull": "<eos>"
},
{
"model_name": "Qwen1.5-0.5B-Chat-GGUF-q8",
"backend": "llamacpp",
"huggingface_id": "Qwen/Qwen1.5-0.5B-Chat-GGUF",
"filename": "*q8_0.gguf",
"premade_chat_template": true,
"bos_string": "<s>",
"eos_string": "<|im_end|>",
"eos_to_cull": "<|im_end|>"
},
{
"model_name": "CapybaraHermes-2.5-Mistral-7B-GGUF-q4",
"backend": "llamacpp",
"huggingface_id": "TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF",
"filename": "*q4_0.gguf",
"premade_chat_template": true,
"bos_string": "<s>",
"eos_string": "<|im_end|>",
"eos_to_cull": "<|im_end|>"
},
{
"model_name": "CapybaraHermes-2.5-Mistral-7B-GGUF-q5",
"backend": "llamacpp",
"huggingface_id": "TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF",
"filename": "*q5_0.gguf",
"premade_chat_template": true,
"bos_string": "<s>",
"eos_string": "<|im_end|>",
"eos_to_cull": "<|im_end|>"
},
{
"model_name": "CapybaraHermes-2.5-Mistral-7B-GGUF-q5-k-s",
"backend": "llamacpp",
"huggingface_id": "TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF",
"filename": "*q5_k_s.gguf",
"premade_chat_template": true,
"bos_string": "<s>",
"eos_string": "<|im_end|>",
"eos_to_cull": "<|im_end|>"
},
{
"model_name": "EstopianMaid-13B-GGUF-q2-k",
"backend": "llamacpp",
"huggingface_id": "TheBloke/EstopianMaid-13B-GGUF",
"filename": "*q2_k.gguf",
"premade_chat_template": false,
"custom_chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() + '\\n\\n' %}{% else %}{% set loop_messages = messages %}{% set system_message = '' %}{% endif %}{% if system_message %}{{ bos_token + system_message }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{bos_token + '### Instruction:\\n' + message['content'].strip() + '\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response:\\n' + message['content'].strip() + eos_token + '\\n\\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '### Response:\\n' }}{% endif %}{% endfor %}",
"bos_string": "<s>",
"eos_string": "</s>",
"eos_to_cull": "</s>"
},
{
"model_name": "EstopianMaid-13B-GGUF-q3-k-s",
"backend": "llamacpp",
"huggingface_id": "TheBloke/EstopianMaid-13B-GGUF",
"filename": "*q3_k_s.gguf",
"premade_chat_template": false,
"custom_chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() + '\\n\\n' %}{% else %}{% set loop_messages = messages %}{% set system_message = '' %}{% endif %}{% if system_message %}{{ bos_token + system_message }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{bos_token + '### Instruction:\\n' + message['content'].strip() + '\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response:\\n' + message['content'].strip() + eos_token + '\\n\\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ '### Response:\\n' }}{% endif %}{% endfor %}",
"bos_string": "<s>",
"eos_string": "</s>",
"eos_to_cull": "</s>"
},
{
"model_name": "openchat_3.5-GGUF-q5",
"backend": "llamacpp",
"huggingface_id": "TheBloke/openchat_3.5-GGUF",
"filename": "*q5_0.gguf",
"premade_chat_template": false,
"custom_chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
"bos_string": "<s>",
"eos_string": "<|end_of_turn|>",
"eos_to_cull": "<|end_of_turn|>"
}
]
32 changes: 30 additions & 2 deletions backends/utils.py
@@ -1,8 +1,8 @@
import copy
from functools import wraps
-from typing import List, Dict
+from typing import List, Dict, Tuple

-from backends import get_logger
+from backends import get_logger, ContextExceededError

logger = get_logger(__name__)

@@ -60,3 +60,31 @@ def wrapped_fn(self, messages):
        return generate_response_fn(self, _messages)

    return wrapped_fn


def check_context_limit_generic(context_size: int, prompt_tokens: List, model_name: str, max_new_tokens: int = 100) \
        -> Tuple[bool, int, int, int]:
    """
    Internal context limit check to run in generate_response.
    :param context_size: The maximum number of context tokens the model supports.
    :param prompt_tokens: List of prompt token IDs.
    :param model_name: Name of the model checked for.
    :param max_new_tokens: How many tokens to generate ('at most', but no stop sequence is defined).
    :return: Tuple with
        Bool: True if context limit is not exceeded, False if too many tokens
        Number of tokens for the given messages and maximum new tokens
        Number of tokens of 'context space left'
        Total context token limit
    """
    prompt_size = len(prompt_tokens)
    tokens_used = prompt_size + max_new_tokens  # context includes tokens to be generated
    tokens_left = context_size - tokens_used
    fits = tokens_used <= context_size

    if not fits:
        logger.info(f"Context token limit for {model_name} exceeded: {tokens_used}/{context_size}")
        # fail gracefully:
        raise ContextExceededError(f"Context token limit for {model_name} exceeded",
                                   tokens_used=tokens_used, tokens_left=tokens_left, context_size=context_size)

    return fits, tokens_used, tokens_left, context_size
48 changes: 48 additions & 0 deletions docs/howto_use_llama-cpp_backend.md
@@ -0,0 +1,48 @@
# Setup and usage of llama.cpp clembench backend
This guide covers the installation and usage of the llama.cpp-based backend for clembench. This backend allows the use
of models in the GGUF format, supporting pre-quantized model versions and merged models. The setup varies by available
hardware backend and operating system, and models may need to be loaded with specific arguments depending on the setup.
## Content
[Setup](#setup)
[Model loading](#model-loading)
## Setup
The clembench llama.cpp backend relies on the llama-cpp-python library, which wraps the C++ llama.cpp library. To make
use of specific hardware, especially GPUs, the installation must include a llama.cpp build that supports it. This may
entail compiling llama.cpp, but pre-compiled versions for common hardware setups are available.
Since this is specific to the available hardware, please refer to the [llama-cpp-python installation instructions](https://llama-cpp-python.readthedocs.io/en/latest/#installation)
to install the library. Using one of the pre-built wheels for the available hardware is recommended, as it does not
require a C++ compiler or compiling llama.cpp during installation.
### Sample setup script
The following example shell script installs the clembench llama.cpp backend with support for CUDA 12.2 GPUs:
```shell
# create separate venv for running the llama.cpp backend:
python3 -m venv venv_llamacpp
source venv_llamacpp/bin/activate
# install basic clembench requirements:
pip3 install -r requirements.txt
# install llama-cpp-python using pre-built wheel with CUDA 12.2 support:
pip3 install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122
```
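After installation, it can be unclear whether the installed wheel actually includes GPU support. A minimal sanity check
like the following can help; this is a sketch assuming a recent llama-cpp-python version (the
`llama_supports_gpu_offload` binding may be missing or named differently in older releases):
```python
# sanity check: confirm llama-cpp-python is importable and reports GPU offload support
import llama_cpp

print("llama-cpp-python version:", llama_cpp.__version__)
# True if the underlying llama.cpp build can offload layers to a GPU backend (CUDA, Metal, ...)
print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())
```
If this prints `False` on a GPU machine, the wheel was most likely built without GPU support and should be reinstalled
with the appropriate extra index URL.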
## Model loading
The clembench llama.cpp backend downloads model files from HuggingFace model repositories. See the [model registry readme](model_backend_registry_readme.md).
By default, the clembench llama.cpp backend loads all model layers onto the available GPU(s). This requires that
llama.cpp GPU support fitting the system hardware was installed during setup.
Optionally, models can be loaded to run on the CPU (using RAM instead of GPU VRAM); this is required if
llama-cpp-python was installed without GPU support. CPU execution is selected by passing a JSON object to the clembench
CLI scripts, or a Python `dict` to the model loading function of the clembench `backends` (see the sketch after the
example below).
The JSON object/`dict` has to contain the model name as defined in the [model registry](model_backend_registry_readme.md)
and the key `execute_on` with the string value `gpu` or `cpu`:
```python
model_on_gpu = {'model_name': "openchat_3.5-GGUF-q5", 'execute_on': "gpu"}
model_on_cpu = {'model_name': "openchat_3.5-GGUF-q5", 'execute_on': "cpu"}
```
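For programmatic use, a minimal sketch could look like the following; it assumes the module-level
`backends.get_model_for` loader and a `set_gen_args` call for sampling parameters, so the exact entry points may differ
in your clembench checkout:
```python
import backends

# settings dict as described above; 'execute_on' selects CPU or GPU execution
model_settings = {'model_name': "openchat_3.5-GGUF-q5", 'execute_on': "cpu"}

# assumed clembench loader; returns a ready-to-use Model instance
model = backends.get_model_for(model_settings)
# assumed clembench call to set the sampling values read via get_temperature()/get_max_tokens()
model.set_gen_args(temperature=0.0, max_tokens=100)

messages = [{"role": "user", "content": "Say hello."}]
prompt, response, response_text = model.generate_response(messages)
print(response_text)
```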
For the clembench CLI scripts, the JSON object is passed as a string delimited by double quotes (`"`):
```shell
# run the taboo clemgame with openchat_3.5-GGUF-q5 on CPU:
python3 scripts/cli.py run -g taboo -m "{'model_name': 'openchat_3.5-GGUF-q5', 'execute_on': 'cpu'}"
```
Alternatively, the number of model layers to offload to GPU can be set by using the `gpu_layers_offloaded` key with an
integer value:
```python
model_15_layers_on_gpu = {'model_name': "openchat_3.5-GGUF-q5", 'gpu_layers_offloaded': 15}
```
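Internally, the backend passes this value straight through to llama-cpp-python's `n_gpu_layers` parameter when loading
the model (see `backends/llamacpp_api.py` above). Loading with 15 offloaded layers roughly corresponds to this sketch
(not the actual clembench code path):
```python
from llama_cpp import Llama

# rough equivalent of what load_model() does for the openchat_3.5-GGUF-q5 registry entry (sketch only)
model = Llama.from_pretrained(
    repo_id="TheBloke/openchat_3.5-GGUF",
    filename="*q5_0.gguf",
    n_gpu_layers=15,  # 0 = CPU only, -1 = offload all layers, N = offload the first N layers
    n_ctx=0,          # 0 = use the context size stored in the GGUF model file
    verbose=False,
)
```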