forked from clp-research/clembench
Commit
Merge branch 'mergefork' of https://github.com/kushal-10/clembench into mergefork
Showing 9 changed files with 385 additions and 4 deletions.
@@ -0,0 +1,198 @@
"""
Backend using llama.cpp for GGUF/GGML models.
"""

from typing import List, Dict, Tuple, Any

import backends
from backends.utils import check_context_limit_generic

import llama_cpp
from llama_cpp import Llama

logger = backends.get_logger(__name__)


def load_model(model_spec: backends.ModelSpec) -> Any:
    """
    Load GGUF/GGML model weights from HuggingFace, into VRAM if available. Weights are distributed over all available
    GPUs for maximum speed - make sure to limit the available GPUs using environment variables if only a subset is to
    be used.
    :param model_spec: The ModelSpec for the model.
    :return: The llama_cpp model class instance of the loaded model.
    """
    logger.info(f'Start loading llama.cpp model weights from HuggingFace: {model_spec.model_name}')

    hf_repo_id = model_spec['huggingface_id']
    hf_model_file = model_spec['filename']

    # default to GPU offload:
    gpu_layers_offloaded = -1  # -1 = offload all model layers to GPU
    # check for optional execute_on flag:
    if hasattr(model_spec, 'execute_on'):
        if model_spec.execute_on == "gpu":
            gpu_layers_offloaded = -1
        elif model_spec.execute_on == "cpu":
            gpu_layers_offloaded = 0
    # check for optional gpu_layers_offloaded value:
    elif hasattr(model_spec, 'gpu_layers_offloaded'):
        gpu_layers_offloaded = model_spec.gpu_layers_offloaded

    if 'requires_api_key' in model_spec and model_spec['requires_api_key']:
        # load HF API key:
        creds = backends.load_credentials("huggingface")
        api_key = creds["huggingface"]["api_key"]
        model = Llama.from_pretrained(hf_repo_id, hf_model_file, token=api_key, verbose=False,
                                      n_gpu_layers=gpu_layers_offloaded, n_ctx=0)
    else:
        model = Llama.from_pretrained(hf_repo_id, hf_model_file, verbose=False, n_gpu_layers=gpu_layers_offloaded,
                                      n_ctx=0)

    logger.info(f"Finished loading llama.cpp model: {model_spec.model_name}")

    return model


def get_chat_formatter(model: Llama, model_spec: backends.ModelSpec) -> llama_cpp.llama_chat_format.Jinja2ChatFormatter:
    # placeholders for BOS/EOS:
    bos_string = None
    eos_string = None

    # check chat template:
    if model_spec.premade_chat_template:
        # jinja chat template available in metadata
        chat_template = model.metadata['tokenizer.chat_template']
    else:
        chat_template = model_spec.custom_chat_template

    if hasattr(model, 'chat_format'):
        if not model.chat_format:
            # no guessed chat format
            pass
        else:
            if model.chat_format == "chatml":
                # get BOS/EOS strings for chatml from llama.cpp:
                bos_string = llama_cpp.llama_chat_format.CHATML_BOS_TOKEN
                eos_string = llama_cpp.llama_chat_format.CHATML_EOS_TOKEN
            elif model.chat_format == "mistral-instruct":
                # get BOS/EOS strings for mistral-instruct from llama.cpp:
                bos_string = llama_cpp.llama_chat_format.MISTRAL_INSTRUCT_BOS_TOKEN
                eos_string = llama_cpp.llama_chat_format.MISTRAL_INSTRUCT_EOS_TOKEN

    # get BOS/EOS token string from model file:
    # NOTE: These may not be the expected tokens, checking these when a model is added is likely necessary!
    if "tokenizer.ggml.bos_token_id" in model.metadata:
        bos_string = model._model.token_get_text(int(model.metadata.get("tokenizer.ggml.bos_token_id")))
    if "tokenizer.ggml.eos_token_id" in model.metadata:
        eos_string = model._model.token_get_text(int(model.metadata.get("tokenizer.ggml.eos_token_id")))

    # get BOS/EOS strings for template from registry if not available from model file:
    if not bos_string:
        bos_string = model_spec.bos_string
    if not eos_string:
        eos_string = model_spec.eos_string

    # init llama-cpp-python jinja chat formatter:
    chat_formatter = llama_cpp.llama_chat_format.Jinja2ChatFormatter(
        template=chat_template,
        bos_token=bos_string,
        eos_token=eos_string
    )

    return chat_formatter


class LlamaCPPLocal(backends.Backend):
    """
    Model/backend handler class for locally-run GGUF/GGML models.
    """
    def __init__(self):
        super().__init__()

    def get_model_for(self, model_spec: backends.ModelSpec) -> backends.Model:
        """
        Get a LlamaCPPLocalModel instance with the passed model and settings. Will load all required data for using
        the model upon initialization.
        :param model_spec: The ModelSpec for the model.
        :return: The Model class instance of the model.
        """
        return LlamaCPPLocalModel(model_spec)


class LlamaCPPLocalModel(backends.Model):
    """
    Class for loaded llama.cpp models ready for generation.
    """
    def __init__(self, model_spec: backends.ModelSpec):
        super().__init__(model_spec)
        self.model = load_model(model_spec)

        self.chat_formatter = get_chat_formatter(self.model, model_spec)

        if hasattr(self.model, 'chat_handler'):
            if not self.model.chat_handler:
                # no custom chat handler
                pass
            else:
                # specific chat handlers may be needed for multimodal models
                # see https://llama-cpp-python.readthedocs.io/en/latest/#multi-modal-models
                pass

        # get context size from model instance:
        self.context_size = self.model._n_ctx

    def generate_response(self, messages: List[Dict], return_full_text: bool = False) -> Tuple[Any, Any, str]:
        """
        :param messages: for example
                [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": "Who won the world series in 2020?"},
                    {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
                    {"role": "user", "content": "Where was it played?"}
                ]
        :param return_full_text: If True, whole input context is returned.
        :return: the continuation
        """
        # use llama.cpp jinja to apply chat template for prompt:
        prompt_text = self.chat_formatter(messages=messages).prompt

        prompt = {"inputs": prompt_text, "max_new_tokens": self.get_max_tokens(),
                  "temperature": self.get_temperature(), "return_full_text": return_full_text}

        prompt_tokens = self.model.tokenize(prompt_text.encode(), add_bos=False)  # BOS expected in template

        # check context limit:
        check_context_limit_generic(self.context_size, prompt_tokens, self.model_spec.model_name,
                                    max_new_tokens=self.get_max_tokens())

        # NOTE: HF transformers models come with their own generation configs, but llama.cpp doesn't seem to have a
        # feature like that. There are default sampling parameters, and clembench only handles two of them so far,
        # which are set accordingly. Other parameters use the llama-cpp-python default values for now.

        # NOTE: llama.cpp has a set sampling order, which differs from that of HF transformers. The latter allows
        # individual sampling orders defined in the generation config that comes with HF models.

        model_output = self.model(
            prompt_text,
            temperature=self.get_temperature(),
            max_tokens=self.get_max_tokens()
        )

        response = {'response': model_output}

        # cull input context:
        if not return_full_text:
            response_text = model_output['choices'][0]['text'].strip()

            if 'output_split_prefix' in self.model_spec:
                response_text = response_text.rsplit(self.model_spec['output_split_prefix'], maxsplit=1)[1]

            eos_len = len(self.model_spec['eos_to_cull'])

            if response_text.endswith(self.model_spec['eos_to_cull']):
                response_text = response_text[:-eos_len]

        else:
            response_text = prompt_text + model_output['choices'][0]['text'].strip()

        return prompt, response, response_text
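For orientation, a minimal usage sketch of the backend defined above. It relies only on the classes shown in this file; the module path, the `ModelSpec.from_dict` construction, and the HuggingFace repo/file names are assumptions for illustration and would normally come from the clembench model registry instead.

```python
# Minimal usage sketch (illustrative only): the module path, ModelSpec.from_dict and the
# registry fields below are assumptions; real entries come from the model registry.
import backends
from backends.llamacpp_api import LlamaCPPLocal  # file/module name assumed

model_spec = backends.ModelSpec.from_dict({
    "model_name": "openchat_3.5-GGUF-q5",
    "huggingface_id": "TheBloke/openchat_3.5-GGUF",   # hypothetical repo id
    "filename": "openchat_3.5.Q5_K_M.gguf",           # hypothetical GGUF file
    "premade_chat_template": True,                    # chat template taken from GGUF metadata
    "eos_to_cull": "<|end_of_turn|>",                 # EOS string to cull from responses
    "execute_on": "cpu",
})

llm = LlamaCPPLocal().get_model_for(model_spec)
# NOTE: temperature and max tokens are expected to be set on the model before generating,
# as the clembench run scripts normally do.

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who won the world series in 2020?"},
]
prompt, response, response_text = llm.generate_response(messages)
print(response_text)
```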
@@ -0,0 +1,48 @@
# Setup and usage of llama.cpp clembench backend
This guide covers the installation and usage of the llama.cpp-based backend for clembench. This backend allows the use
of models in the GGUF format, supporting pre-quantized model versions and merged models. The setup varies by available
hardware backend and operating system, and models may need to be loaded with specific arguments depending on the setup.
## Content
[Setup](#setup)
[Model loading](#model-loading)
## Setup
The clembench llama.cpp backend relies on the llama-cpp-python library, which wraps the C++ llama.cpp library. To make
use of specific hardware, especially GPUs, the installation must include a fitting build of llama.cpp. This may entail
compiling llama.cpp, but pre-compiled versions for specific hardware are available.
Since this is specific to the available hardware, please refer to the [llama-cpp-python installation instructions](https://llama-cpp-python.readthedocs.io/en/latest/#installation)
to install the library. It is recommended to use one of the pre-built wheels for the available hardware, as this requires
neither a C++ compiler nor compiling llama.cpp during the installation.
### Sample setup script
The following example shell script installs the clembench llama.cpp backend with support for CUDA 12.2 GPUs:
```shell
# create separate venv for running the llama.cpp backend:
python3 -m venv venv_llamacpp
source venv_llamacpp/bin/activate
# install basic clembench requirements:
pip3 install -r requirements.txt
# install llama-cpp-python using pre-built wheel with CUDA 12.2 support:
pip3 install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122
```
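After installation, it can be worth checking that the installed wheel was actually built with GPU offload support before downloading large models. A minimal check, assuming a llama-cpp-python version recent enough to expose `llama_supports_gpu_offload`:
```python
# quick sanity check; whether this helper is available depends on the installed
# llama-cpp-python version:
import llama_cpp

print("llama-cpp-python version:", llama_cpp.__version__)
print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())
```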
## Model loading
The clembench llama.cpp backend downloads model files from HuggingFace model repositories. See the [model registry readme](model_backend_registry_readme.md).
By default, the clembench llama.cpp backend loads all model layers onto the available GPU(s). This requires that
llama.cpp was installed during setup with GPU support fitting the system hardware.
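If the machine has multiple GPUs but only a subset should be used, limit the devices visible to llama.cpp via the usual environment variables before running any clembench script, for example with CUDA:
```shell
# make only the first GPU visible to the llama.cpp backend:
export CUDA_VISIBLE_DEVICES=0
```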
Optionally, models can be loaded to run on the CPU (using RAM instead of GPU VRAM). This is required if llama-cpp-python
was installed without GPU support. To do so, pass a JSON object to the clembench CLI scripts, or a Python `dict` to the
model loading function of the clembench `backends`.
The JSON object/`dict` has to contain the model name as defined in the [model registry](model_backend_registry_readme.md)
and the key `execute_on` with the string value `gpu` or `cpu`:
```python
model_on_gpu = {'model_name': "openchat_3.5-GGUF-q5", 'execute_on': "gpu"}
model_on_cpu = {'model_name': "openchat_3.5-GGUF-q5", 'execute_on': "cpu"}
```
For the clembench CLI scripts, the JSON object is passed as a string wrapped in double quotes:
```shell
# run the taboo clemgame with openchat_3.5-GGUF-q5 on CPU:
python3 scripts/cli.py run -g taboo -m "{'model_name': 'openchat_3.5-GGUF-q5', 'execute_on': 'cpu'}"
```
Alternatively, the number of model layers to offload to the GPU can be set by using the `gpu_layers_offloaded` key with
an integer value:
```python
model_15_layers_on_gpu = {'model_name': "openchat_3.5-GGUF-q5", 'gpu_layers_offloaded': 15}
```
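The same `dict` form can also be used when loading models programmatically rather than via the CLI. A sketch, assuming the module-level loader is `backends.get_model_for` (check the clembench version for the exact entry point):
```python
# programmatic loading sketch; backends.get_model_for is assumed to accept a
# registry-style dict with extra loading options:
import backends

model = backends.get_model_for({'model_name': "openchat_3.5-GGUF-q5",
                                'gpu_layers_offloaded': 15})
```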