Llamacpp cpu support #190

Open · wants to merge 9 commits into base: main
Changes from 5 commits
10 changes: 10 additions & 0 deletions bench_llamacpp/bench.py
@@ -95,6 +95,16 @@ def postprocess(self, output: dict) -> str:
)

runner_dict = {
"cpu": [
{
"precision": "int4",
"model_path": os.path.join(model_folder, model_name + "Q4_K_M.gguf"),
},
{
"precision": "int8",
"model_path": os.path.join(model_folder, model_name + "Q8_0.gguf"),
},
],
"cuda": [
{
"precision": "int4",
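For context (not part of this diff): a minimal sketch of how the new "cpu" entries could be exercised with llama-cpp-python. The model_folder and model_name values below are illustrative placeholders, assumed to line up with the GGUF filenames downloaded by setup.sh.

# Illustrative only: load the int4 CPU entry and run a short completion.
# model_folder / model_name are assumed placeholders, not values from this PR.
import os
from llama_cpp import Llama

model_folder = "models/llama-2-7b-chat-gguf"
model_name = "llama-2-7b-chat."  # assumed to end with "." so the join matches the GGUF filename

model_path = os.path.join(model_folder, model_name + "Q4_K_M.gguf")
llm = Llama(model_path=model_path, n_gpu_layers=0)  # n_gpu_layers=0 keeps all layers on CPU
out = llm("Summarize GGUF in one sentence.", max_tokens=64, temperature=0.1)
print(out["choices"][0]["text"])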
6 changes: 2 additions & 4 deletions bench_llamacpp/bench.sh
@@ -74,6 +74,7 @@ setup() {
bash "$SCRIPT_DIR/setup.sh" "$DEVICE" "$MODEL_NAME"
}


run_benchmarks() {
local PROMPT="$1"
local REPETITIONS="$2"
@@ -122,9 +123,6 @@ while [ "$#" -gt 0 ]; do
esac
if [ "$DEVICE" == "cuda" ]; then
check_cuda
else
echo "Not supported for $DEVICE"
exit 1
fi
shift 2
;;
@@ -152,5 +150,5 @@ MAX_TOKENS="${MAX_TOKENS:-512}"
DEVICE="${DEVICE:-'cuda'}"
MODEL_NAME="${MODEL_NAME:-"llama"}"

setup "$DEVICE"
setup "$DEVICE" "$MODEL_NAME"
run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME"
8 changes: 4 additions & 4 deletions bench_llamacpp/requirements.txt
@@ -1,4 +1,4 @@
llama_cpp_python==0.2.62
huggingface_hub==0.22.2
transformers==4.39.3
torch==2.2.2
huggingface_hub==0.24.2
transformers==4.43.2
torch==2.4.0
psutil==6.0.0
89 changes: 46 additions & 43 deletions bench_llamacpp/setup.sh
@@ -1,13 +1,21 @@
#!/bin/bash

################################################################################
# Script: setup.sh <DEVICE>
# Script: setup.sh <DEVICE> <MODEL_NAME>
# Description: Automates the setup of a virtual environment and installs project
# requirements.
################################################################################

set -euo pipefail

# Set default folder paths for GGUF weights
CURRENT_DIR="$(pwd)"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
VENV_DIR="$SCRIPT_DIR/venv"
MODELS_DIR="$CURRENT_DIR/models"
LLAMA2_GGUF_WEIGHTS_DIR="$MODELS_DIR/llama-2-7b-chat-gguf"
MISTRAL_GGUF_WEIGHTS_DIR="$MODELS_DIR/mistral-7b-v0.1-instruct-gguf"

check_python() {
if command -v python &> /dev/null; then
PYTHON_CMD="python"
@@ -19,65 +27,57 @@ check_python() {
fi
}

clone_and_build_llama() {
local DEVICE="$1"
local VENV_DIR="$2"
local SCRIPT_DIR="$3"

if [ "$#" -ne 3 ]; then
echo "Usage: $0 <DEVICE> <ENV> <SCRIPT_DIR>"
exit 1
fi
download_gguf_weights() {
local MODEL_NAME="$1"
local DOWNLOAD_DIR

case "$DEVICE" in
cuda)
export LLAMA_CUBLAS=on
;;
metal)
export LLAMA_METAL=on
case "$MODEL_NAME" in
llama)
DOWNLOAD_DIR="$LLAMA2_GGUF_WEIGHTS_DIR"
MODEL_IDENTIFIER="TheBloke/Llama-2-7B-Chat-GGUF"
MODEL_FILE_4BIT="llama-2-7b-chat.Q4_K_M.gguf"
MODEL_FILE_8BIT="llama-2-7b-chat.Q8_0.gguf"
;;
cpu)
return 0
mistral)
DOWNLOAD_DIR="$MISTRAL_GGUF_WEIGHTS_DIR"
MODEL_IDENTIFIER="TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
MODEL_FILE_4BIT="mistral-7b-instruct-v0.1.Q4_K_M.gguf"
MODEL_FILE_8BIT="mistral-7b-instruct-v0.1.Q8_0.gguf"
;;
*)
echo "Unsupported DEVICE: $DEVICE"
return 1
echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'"
exit 1
;;
esac

local LIBLLAMA_FILE="$VENV_DIR/libllama_$DEVICE.so"

if [ -e "$LIBLLAMA_FILE" ]; then
echo "File $LIBLLAMA_FILE exists."
exit 0
fi

# Remove existing llama.cpp directory if it exists
if [ -d "$SCRIPT_DIR/llama.cpp" ]; then
echo "Removing existing llama.cpp directory..."
rm -rf "$SCRIPT_DIR"/llama.cpp
if [ ! -d "$DOWNLOAD_DIR" ]; then
huggingface-cli download "$MODEL_IDENTIFIER" "$MODEL_FILE_4BIT" --local-dir "$DOWNLOAD_DIR" --local-dir-use-symlinks False
huggingface-cli download "$MODEL_IDENTIFIER" "$MODEL_FILE_8BIT" --local-dir "$DOWNLOAD_DIR" --local-dir-use-symlinks False
else
echo "Weights for $MODEL_NAME already downloaded."
fi
}

git clone --depth=1 https://github.com/ggerganov/llama.cpp "$SCRIPT_DIR"/llama.cpp
cd "$SCRIPT_DIR"/llama.cpp
clone_and_build_llama() {
local DEVICE="$1"

# Build llama.cpp
make clean > /dev/null
echo "Building llama.cpp..."
make libllama.so > /dev/null
cp libllama.so "$LIBLLAMA_FILE"
cd "$SCRIPT_DIR"

rm -rf "$SCRIPT_DIR"/llama.cpp
if [ "$DEVICE" == "cuda" ]; then
pip install llama-cpp-python \
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
else
pip install llama-cpp-python \
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
fi
}

# CLI Args
DEVICE="$1"
MODEL_NAME="$2"

# Define directory paths
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV_DIR="$SCRIPT_DIR/venv"
LIBLLAMA_FILE="$VENV_DIR/libllama_$DEVICE.so"

check_python

@@ -87,10 +87,13 @@ if [ ! -d "$VENV_DIR" ]; then
# shellcheck disable=SC1091
source "$VENV_DIR/bin/activate"
pip install --upgrade pip > /dev/null
clone_and_build_llama "$DEVICE"
pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null
pip install numpy --upgrade
else
# shellcheck disable=SC1091
source "$VENV_DIR/bin/activate"

fi

clone_and_build_llama "$DEVICE" "$VENV_DIR" "$SCRIPT_DIR"
download_gguf_weights "$MODEL_NAME"
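
For illustration (not part of this diff): a rough Python equivalent of the two huggingface-cli downloads in download_gguf_weights, using the huggingface_hub package already pinned in requirements.txt. The paths mirror the llama case above.

# Illustrative Python counterpart of the CLI downloads above (llama case only).
from huggingface_hub import hf_hub_download

repo_id = "TheBloke/Llama-2-7B-Chat-GGUF"
local_dir = "models/llama-2-7b-chat-gguf"

for filename in ("llama-2-7b-chat.Q4_K_M.gguf", "llama-2-7b-chat.Q8_0.gguf"):
    # Fetches each quantized GGUF file into local_dir, reusing the Hugging Face cache when possible.
    hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_dir)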
31 changes: 31 additions & 0 deletions common/base.py
@@ -1,9 +1,11 @@
import json
import os
import psutil
from abc import ABC, abstractmethod

from tqdm.auto import tqdm

import time
from common.memory_tracker import MemoryTracker
from common.utils import get_logger

@@ -171,6 +173,28 @@ def _benchmark_cuda(self, prompt: str, max_tokens: int, temperature: float):
gpu_mem_consumed = round(peak_nvml_mb, 2)

return (token_per_sec, gpu_mem_consumed)

def _benchmark_cpu(self, prompt: str, max_tokens: int, temperature: float = 0.1):
process = psutil.Process()
initial_memory_info = process.memory_info()

inputs = self.preprocess(prompt=prompt, for_benchmarks=True)

temperature = 0.1 if temperature is None else temperature

start_time = time.time()
output_dict = self.run_model(inputs, max_tokens, temperature)
end_time = time.time()

final_memory_info = process.memory_info()
memory_used = final_memory_info.rss - initial_memory_info.rss
elapsed_time = end_time - start_time

num_output_tokens = output_dict["num_output_tokens"]
tokens_per_sec = num_output_tokens / elapsed_time

return (tokens_per_sec, memory_used)


def benchmark(
self, prompt: str, max_tokens: int, repetitions: int, temperature: float = 0.1
@@ -186,6 +210,13 @@ def benchmark(
)
self.tps_results.append(tok_per_sec)
self.memory_usage_results.append(gpu_memory_consumed)

elif self.device == "cpu":
token_per_sec, cpu_memory_used = self._benchmark_cpu(
prompt=prompt, max_tokens=max_tokens, temperature=temperature
)
self.tps_results.append(token_per_sec)
self.memory_usage_results.append(cpu_memory_used)
else:
raise NotImplementedError(
"For other device base benchmark is not implemented"
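For reference (not part of this diff): a self-contained sketch of the measurement pattern _benchmark_cpu uses, i.e. an RSS delta from psutil around the generation call plus wall-clock tokens per second. fake_generate is a hypothetical stand-in for run_model.

# Minimal sketch of the CPU benchmarking pattern: psutil RSS delta + tokens/sec.
import time
import psutil


def fake_generate(prompt: str, max_tokens: int) -> dict:
    # Hypothetical stand-in for run_model; sleeps briefly so elapsed time is non-zero.
    time.sleep(0.05)
    return {"num_output_tokens": max_tokens}


def benchmark_cpu_once(prompt: str, max_tokens: int):
    process = psutil.Process()
    rss_before = process.memory_info().rss

    start = time.time()
    output = fake_generate(prompt, max_tokens)
    elapsed = time.time() - start

    rss_after = process.memory_info().rss
    tokens_per_sec = output["num_output_tokens"] / elapsed
    return tokens_per_sec, rss_after - rss_before


print(benchmark_cpu_once("hello", 512))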