Llamacpp cpu support #190

Open · wants to merge 9 commits into base: main
Changes from 5 commits
10 changes: 10 additions & 0 deletions bench_llamacpp/bench.py
@@ -95,6 +95,16 @@ def postprocess(self, output: dict) -> str:
)

runner_dict = {
"cpu": [
{
"precision": "int4",
"model_path": os.path.join(model_folder, model_name + "Q4_K_M.gguf"),
},
{
"precision": "int8",
"model_path": os.path.join(model_folder, model_name + "Q8_0.gguf"),
},
],
"cuda": [
{
"precision": "int4",
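For context (not part of this diff): a minimal sketch of how the new "cpu" entries could be exercised with llama-cpp-python. The model_folder and model_name values below are illustrative placeholders, assumed to line up with the GGUF filenames downloaded by setup.sh.

# Illustrative only: load the int4 CPU entry and run a short completion.
# model_folder / model_name are assumed placeholders, not values from this PR.
import os
from llama_cpp import Llama

model_folder = "models/llama-2-7b-chat-gguf"
model_name = "llama-2-7b-chat."  # assumed to end with "." so the join matches the GGUF filename

model_path = os.path.join(model_folder, model_name + "Q4_K_M.gguf")
llm = Llama(model_path=model_path, n_gpu_layers=0)  # n_gpu_layers=0 keeps all layers on CPU
out = llm("Summarize GGUF in one sentence.", max_tokens=64, temperature=0.1)
print(out["choices"][0]["text"])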
6 changes: 2 additions & 4 deletions bench_llamacpp/bench.sh
@@ -74,6 +74,7 @@ setup() {
bash "$SCRIPT_DIR/setup.sh" "$DEVICE" "$MODEL_NAME"
}


run_benchmarks() {
local PROMPT="$1"
local REPETITIONS="$2"
@@ -122,9 +123,6 @@ while [ "$#" -gt 0 ]; do
esac
if [ "$DEVICE" == "cuda" ]; then
check_cuda
else
echo "Not supported for $DEVICE"
exit 1
fi
shift 2
;;
@@ -152,5 +150,5 @@ MAX_TOKENS="${MAX_TOKENS:-512}"
DEVICE="${DEVICE:-'cuda'}"
MODEL_NAME="${MODEL_NAME:-"llama"}"

setup "$DEVICE"
setup "$DEVICE" "$MODEL_NAME"
run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME"
8 changes: 4 additions & 4 deletions bench_llamacpp/requirements.txt
@@ -1,4 +1,4 @@
llama_cpp_python==0.2.62
huggingface_hub==0.22.2
transformers==4.39.3
torch==2.2.2
huggingface_hub==0.24.2
transformers==4.43.2
torch==2.4.0
psutil==6.0.0
89 changes: 46 additions & 43 deletions bench_llamacpp/setup.sh
@@ -1,13 +1,21 @@
#!/bin/bash

################################################################################
# Script: setup.sh <DEVICE>
# Script: setup.sh <DEVICE> <MODEL_NAME>
# Description: Automates the setup of a virtual environment and installs project
# requirements.
################################################################################

set -euo pipefail

# Set default folder paths for GGUF weights
CURRENT_DIR="$(pwd)"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
VENV_DIR="$SCRIPT_DIR/venv"
MODELS_DIR="$CURRENT_DIR/models"
LLAMA2_GGUF_WEIGHTS_DIR="$MODELS_DIR/llama-2-7b-chat-gguf"
MISTRAL_GGUF_WEIGHTS_DIR="$MODELS_DIR/mistral-7b-v0.1-instruct-gguf"

check_python() {
if command -v python &> /dev/null; then
PYTHON_CMD="python"
@@ -19,65 +27,57 @@ check_python() {
fi
}

clone_and_build_llama() {
local DEVICE="$1"
local VENV_DIR="$2"
local SCRIPT_DIR="$3"

if [ "$#" -ne 3 ]; then
echo "Usage: $0 <DEVICE> <ENV> <SCRIPT_DIR>"
exit 1
fi
download_gguf_weights() {
local MODEL_NAME="$1"
local DOWNLOAD_DIR

case "$DEVICE" in
cuda)
export LLAMA_CUBLAS=on
;;
metal)
export LLAMA_METAL=on
case "$MODEL_NAME" in
llama)
DOWNLOAD_DIR="$LLAMA2_GGUF_WEIGHTS_DIR"
MODEL_IDENTIFIER="TheBloke/Llama-2-7B-Chat-GGUF"
MODEL_FILE_4BIT="llama-2-7b-chat.Q4_K_M.gguf"
MODEL_FILE_8BIT="llama-2-7b-chat.Q8_0.gguf"
;;
cpu)
return 0
mistral)
DOWNLOAD_DIR="$MISTRAL_GGUF_WEIGHTS_DIR"
MODEL_IDENTIFIER="TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
MODEL_FILE_4BIT="mistral-7b-instruct-v0.1.Q4_K_M.gguf"
MODEL_FILE_8BIT="mistral-7b-instruct-v0.1.Q8_0.gguf"
;;
*)
echo "Unsupported DEVICE: $DEVICE"
return 1
echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'"
exit 1
;;
esac

local LIBLLAMA_FILE="$VENV_DIR/libllama_$DEVICE.so"

if [ -e "$LIBLLAMA_FILE" ]; then
echo "File $LIBLLAMA_FILE exists."
exit 0
fi

# Remove existing llama.cpp directory if it exists
if [ -d "$SCRIPT_DIR/llama.cpp" ]; then
echo "Removing existing llama.cpp directory..."
rm -rf "$SCRIPT_DIR"/llama.cpp
if [ ! -d "$DOWNLOAD_DIR" ]; then
huggingface-cli download "$MODEL_IDENTIFIER" "$MODEL_FILE_4BIT" --local-dir "$DOWNLOAD_DIR" --local-dir-use-symlinks False
huggingface-cli download "$MODEL_IDENTIFIER" "$MODEL_FILE_8BIT" --local-dir "$DOWNLOAD_DIR" --local-dir-use-symlinks False
else
echo "Weights for $MODEL_NAME already downloaded."
fi
}

git clone --depth=1 https://github.com/ggerganov/llama.cpp "$SCRIPT_DIR"/llama.cpp
cd "$SCRIPT_DIR"/llama.cpp
clone_and_build_llama() {
local DEVICE="$1"

# Build llama.cpp
make clean > /dev/null
echo "Building llama.cpp..."
make libllama.so > /dev/null
cp libllama.so "$LIBLLAMA_FILE"
cd "$SCRIPT_DIR"

rm -rf "$SCRIPT_DIR"/llama.cpp
if [ "$DEVICE" == "cuda" ]; then
pip install llama-cpp-python \
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
else
pip install llama-cpp-python \
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
fi
}

# CLI Args
DEVICE="$1"
MODEL_NAME="$2"

# Define directory paths
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV_DIR="$SCRIPT_DIR/venv"
LIBLLAMA_FILE="$VENV_DIR/libllama_$DEVICE.so"

check_python

@@ -87,10 +87,13 @@ if [ ! -d "$VENV_DIR" ]; then
# shellcheck disable=SC1091
source "$VENV_DIR/bin/activate"
pip install --upgrade pip > /dev/null
clone_and_build_llama "$DEVICE"
pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null
pip install numpy --upgrade
else
# shellcheck disable=SC1091
source "$VENV_DIR/bin/activate"

fi

clone_and_build_llama "$DEVICE" "$VENV_DIR" "$SCRIPT_DIR"
download_gguf_weights "$MODEL_NAME"
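
For illustration (not part of this diff): a rough Python equivalent of the two huggingface-cli downloads in download_gguf_weights, using the huggingface_hub package already pinned in requirements.txt. The paths mirror the llama case above.

# Illustrative Python counterpart of the CLI downloads above (llama case only).
from huggingface_hub import hf_hub_download

repo_id = "TheBloke/Llama-2-7B-Chat-GGUF"
local_dir = "models/llama-2-7b-chat-gguf"

for filename in ("llama-2-7b-chat.Q4_K_M.gguf", "llama-2-7b-chat.Q8_0.gguf"):
    # Fetches each quantized GGUF file into local_dir, reusing the Hugging Face cache when possible.
    hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_dir)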
31 changes: 31 additions & 0 deletions common/base.py
@@ -1,9 +1,11 @@
import json
import os
import psutil
from abc import ABC, abstractmethod

from tqdm.auto import tqdm

import time
from common.memory_tracker import MemoryTracker
from common.utils import get_logger

@@ -171,6 +173,28 @@ def _benchmark_cuda(self, prompt: str, max_tokens: int, temperature: float):
gpu_mem_consumed = round(peak_nvml_mb, 2)

return (token_per_sec, gpu_mem_consumed)

def _benchmark_cpu(self, prompt: str, max_tokens: int, temperature: float = 0.1):
process = psutil.Process()
initial_memory_info = process.memory_info()

inputs = self.preprocess(prompt=prompt, for_benchmarks=True)

temperature = 0.1 if temperature is None else temperature

start_time = time.time()
output_dict = self.run_model(inputs, max_tokens, temperature)
end_time = time.time()

final_memory_info = process.memory_info()
memory_used = final_memory_info.rss - initial_memory_info.rss
elapsed_time = end_time - start_time

num_output_tokens = output_dict["num_output_tokens"]
tokens_per_sec = num_output_tokens / elapsed_time

return (tokens_per_sec, memory_used)


def benchmark(
self, prompt: str, max_tokens: int, repetitions: int, temperature: float = 0.1
@@ -186,6 +210,13 @@ def benchmark(
)
self.tps_results.append(tok_per_sec)
self.memory_usage_results.append(gpu_memory_consumed)

elif self.device == "cpu":
token_per_sec, cpu_memory_used = self._benchmark_cpu(
prompt=prompt, max_tokens=max_tokens, temperature=temperature
)
self.tps_results.append(token_per_sec)
self.memory_usage_results.append(cpu_memory_used)
else:
raise NotImplementedError(
"For other device base benchmark is not implemented"
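For reference (not part of this diff): a self-contained sketch of the measurement pattern _benchmark_cpu uses, i.e. an RSS delta from psutil around the generation call plus wall-clock tokens per second. fake_generate is a hypothetical stand-in for run_model.

# Minimal sketch of the CPU benchmarking pattern: psutil RSS delta + tokens/sec.
import time
import psutil


def fake_generate(prompt: str, max_tokens: int) -> dict:
    # Hypothetical stand-in for run_model; sleeps briefly so elapsed time is non-zero.
    time.sleep(0.05)
    return {"num_output_tokens": max_tokens}


def benchmark_cpu_once(prompt: str, max_tokens: int):
    process = psutil.Process()
    rss_before = process.memory_info().rss

    start = time.time()
    output = fake_generate(prompt, max_tokens)
    elapsed = time.time() - start

    rss_after = process.memory_info().rss
    tokens_per_sec = output["num_output_tokens"] / elapsed
    return tokens_per_sec, rss_after - rss_before


print(benchmark_cpu_once("hello", 512))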