Merge pull request #26 from premAI-io/main
Merge from main
Anindyadeep authored Jan 29, 2024
2 parents c804d79 + 795a811 commit 747b6ca
Showing 13 changed files with 248 additions and 108 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -38,12 +38,12 @@ Take a first glance of Llama-2-7B Model Performance Metrics Across Different Pre
| tinygrad | - | 20.32 ± 0.06 | - | - |
| onnx | - | 54.16 ± 3.15 | - | - |
| transformers (pytorch) | 43.79 ± 0.61 | 46.39 ± 0.28 | 6.98 ± 0.05 | 21.72 ± 0.11 |
| vllm | 91.23 ± 3.61 | 91.44 ± 3.83 | - | 113.38 ± 11.70|
| exllamav2 | - | - | 116.91 ± 1.73 | 164.28 ± 4.07 |
| vllm | 90.78 ± 1.60 | 90.54 ± 2.22 | - | - |
| exllamav2 | - | - | 121.63 ± 0.74 | 130.16 ± 0.35 |
| ctransformers | - | - | 76.75 ± 10.36 | 84.26 ± 5.79 |
| AutoGPTQ | 42.01 ± 1.03 | 30.24 ± 0.41 | - | - |
| AutoAWQ | - | - | - | 116.94 ± 13.14|
| DeepSpeed | - | 83.54 ± 5.25 | - | |
| AutoAWQ | - | - | - | 109.20 ± 3.28 |
| DeepSpeed | - | 81.44 ± 8.13 | - | |
| PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 |
| Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - |
| Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - |
36 changes: 36 additions & 0 deletions bench_autoawq/README.md
@@ -0,0 +1,36 @@
# AutoAWQ

[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/casper-hansen/AutoAWQ)  
[![ArXiv](https://img.shields.io/badge/arXiv-%230170FE.svg?style=for-the-badge&logo=arxiv&logoColor=white)](https://arxiv.org/abs/2306.00978)


[AutoAWQ](https://github.com/casper-hansen/AutoAWQ) is a polished implementation of the original [llm-awq](https://github.com/mit-han-lab/llm-awq) work from MIT. AWQ (Activation-aware Weight Quantization) is a quantization method that supports 4-bit quantization. It substantially increases inference throughput while reducing the model's memory requirements (for example, according to this [reference](https://huggingface.co/TheBloke/Llama-2-70B-Chat-AWQ), Llama2 70B normally requires 2 x 80 GB GPUs, but with AutoAWQ it can run on a single 48 GB GPU). You can learn more about AWQ in the research paper and the GitHub implementations.
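
The sketch below shows how an AWQ-quantized checkpoint is typically loaded and queried with AutoAWQ; the checkpoint id and generation settings are illustrative assumptions, not this benchmark's exact code (see [bench.py](/bench_autoawq/bench.py) for the actual harness):

```python
# Minimal sketch, assuming `autoawq` and `transformers` are installed and a CUDA GPU is available.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "TheBloke/Llama-2-7B-AWQ"  # illustrative 4-bit AWQ checkpoint

# Load the 4-bit AWQ weights; fuse_layers enables the fused CUDA kernels for faster decoding.
model = AutoAWQForCausalLM.from_quantized(model_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(model_path)

input_ids = tokenizer("Explain what a transformer is.", return_tensors="pt").input_ids.cuda()
output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```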

### 🚀 Running the AutoAWQ Benchmark.

You can run the AutoAWQ benchmark using the following command:

```bash
./bench_autoawq/bench.sh \
  --prompt <value> \              # Enter a prompt string
  --max_tokens <value> \          # Maximum number of tokens to output
  --repetitions <value> \         # Number of repetitions to be made for the prompt
  --log_file <file_path> \        # A .log file under which to write the results
  --device <cpu/cuda/metal> \     # The device on which to run the benchmark
  --models_dir <path_to_models>   # The directory in which AWQ model weights are present
```

To get started quickly you can simply run:

```bash
./bench_autoawq/bench.sh -d cuda
```
This will use the default values (see the [bench.sh](/bench_autoawq/bench.sh) file) and run the benchmarks. You can find all the benchmark results for AutoAWQ [here](/docs/llama2.md).


### 👀 Some points to note:

1. AutoAWQ does not support devices other than GPU (it only runs when CUDA is available).
2. We benchmark AutoAWQ on its own (i.e. the actual AWQ quantization method). We do not benchmark combinations such as AutoAWQ + vLLM or AutoAWQ + TensorRT.
3. The default model chosen for this benchmark is [Llama2-AutoAWQ by The Bloke](https://huggingface.co/TheBloke/Llama-2-7B-AWQ).
4. AutoAWQ does not yet support INT8 quantization properly. See [this issue](https://github.com/casper-hansen/AutoAWQ/issues/45).
15 changes: 6 additions & 9 deletions bench_autoawq/bench.py
@@ -19,10 +19,8 @@

class LlamaAutoAWQBenchmark:
def __init__(self, model_path: str, precision: int, device: str) -> None:
assert precision in ["fp16"], "For benchmarks supported precision is in FP16."
assert (
device == "cuda"
), "Since it's an optimization for FP-16, CPU not supported."
assert device == "cuda", "Device other than CUDA is not supported for autoawq."
assert precision == "int4", "Precision other than INT4 is not supported."

self.model_path, self.precision, self.device = (
model_path,
@@ -99,11 +97,10 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
)
report = defaultdict(lambda: defaultdict(float))

# Hardcoding precision to fp16 for AutoAWQ
precision = 16
precision = 4

if args.device == "cpu":
logging.info("Skipping running model on fp16 on CPU, not implemented for Half")
logging.info("Skipping running model on int4 on CPU, not implemented for Half")
pass
else:
logging.info(
@@ -112,15 +109,15 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
llama_autogptq_benchmark = LlamaAutoAWQBenchmark(
model_path=f"{args.models_dir}/llama-2-7b-autoawq",
device=args.device,
precision=f"fp{precision}",
precision=f"int{precision}",
).load_model()
llama_autogptq_benchmark.benchmark(
max_tokens=args.max_tokens,
prompt=args.prompt,
repetitions=args.repetitions,
)

report["Llama AutoAWQ"][f"FP-{precision}"] = {
report["Llama AutoAWQ"][f"INT-{precision}"] = {
"mean": np.mean(llama_autogptq_benchmark.results),
"std": np.std(llama_autogptq_benchmark.results),
}
44 changes: 29 additions & 15 deletions bench_autoawq/bench.sh
@@ -6,26 +6,27 @@
#
# Usage: ./bench.sh [OPTIONS]
# OPTIONS:
# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')
# -r, --repetitions Number of repetitions for benchmarks (default: 2)
# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)
# -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')
# -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')
# -r, --repetitions Number of repetitions for benchmarks (default: 10)
# -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)
# -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')
# -lf, --log_file Logging file name.
# -md, --models_dir Models directory.
# -h, --help Show this help message
########################################################################################################

set -euo pipefail

CURRENT_DIR="$(pwd)"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

print_usage() {
echo "Usage: $0 [OPTIONS]"
echo "OPTIONS:"
echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')"
echo " -r, --repetitions Number of repetitions for benchmarks (default: 2)"
echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)"
echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')"
echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')"
echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)"
echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)"
echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')"
echo " -lf, --log_file Logging file name."
echo " -md, --models_dir Models directory."
echo " -h, --help Show this help message"
@@ -69,6 +70,18 @@ check_python() {


setup() {

# Check if Logs folder exists else Make the logs folder
LOGS_FOLDER="$CURRENT_DIR/Logs"

if [ -d "$LOGS_FOLDER" ]; then
echo "Folder '$LOGS_FOLDER' already exists. Skipping."
else
# Create the folder
mkdir "$LOGS_FOLDER"
echo "'$LOGS_FOLDER' created."
fi

echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
bash "$SCRIPT_DIR"/setup.sh
}
@@ -142,15 +155,16 @@ while [ "$#" -gt 0 ]; do
esac
done

check_platform
check_python
setup

# Set default values if not provided
PROMPT="${PROMPT:-"Explain what is a transformer"}"
PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}"
REPETITIONS="${REPETITIONS:-10}"
MAX_TOKENS="${MAX_TOKENS:-100}"
DEVICE="${DEVICE:-'cpu'}"
LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}"
MAX_TOKENS="${MAX_TOKENS:-512}"
DEVICE="${DEVICE:-'cuda'}"
LOG_FILENAME="${LOG_FILENAME:-"$LOGS_FOLDER/benchmark_autoawq_$(date +'%Y%m%d%H%M%S').log"}"
MODELS_DIR="${MODELS_DIR:-"./models"}"

check_platform
check_python
setup
run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR"
34 changes: 34 additions & 0 deletions bench_exllamav2/README.md
@@ -0,0 +1,34 @@
# ExLlamaV2

[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/turboderp/exllamav2) &nbsp;

[ExLlamaV2](https://github.com/turboderp/exllamav2) uses custom kernels to speed up LLM inference under different quantizations. ExLlamaV2 supports the new "EXL2" format, which is based on the same optimization method as GPTQ and supports 2, 3, 4, 5, 6 and 8-bit quantization. For this benchmark implementation, we use the 4-bit and 8-bit quantized versions of Llama2.
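
The benchmark harness (see the bench_exllamav2/bench.py diff later in this commit) loads an EXL2 model roughly as in the sketch below; the model directory path is an illustrative assumption:

```python
# Minimal sketch of the ExLlamaV2 load path used by the benchmark; the model_dir value is assumed.
from exllamav2 import ExLlamaV2, ExLlamaV2Cache
from exllamav2.config import ExLlamaV2Config
from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
from exllamav2.tokenizer import ExLlamaV2Tokenizer

config = ExLlamaV2Config()
config.model_dir = "./models/llama-2-7b-exl2"  # directory holding the EXL2 weights (assumed path)
config.prepare()

model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy=True)  # lazy cache so load_autosplit can place layers across GPUs
model.load_autosplit(cache)
tokenizer = ExLlamaV2Tokenizer(config)

generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
settings = ExLlamaV2Sampler.Settings()
settings.temperature = 0.85
settings.top_k = 50
settings.top_p = 0.8
settings.token_repetition_penalty = 1.05

generator.warmup()  # keep CUDA kernel warmup out of the timed runs
print(generator.generate_simple("Explain what a transformer is.", settings, 128, seed=1234))
```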


### 🚀 Running the ExLlamaV2 Benchmark.

You can run the ExLlamaV2 benchmark using the following command:

```bash
./bench_exllamav2/bench.sh \
  --prompt <value> \              # Enter a prompt string
  --max_tokens <value> \          # Maximum number of tokens to output
  --repetitions <value> \         # Number of repetitions to be made for the prompt
  --log_file <file_path> \        # A .log file under which to write the results
  --device <cpu/cuda/metal> \     # The device on which to run the benchmark
  --models_dir <path_to_models>   # The directory in which model weights are present
```

To get started quickly you can simply run:

```bash
./bench_exllamav2/bench.sh -d cuda
```
This will use the default values (see the [bench.sh](/bench_exllamav2/bench.sh) file) and perform the benchmarks. You can find all the benchmark results for ExLlamaV2 [here](/docs/llama2.md).


### 👀 Some points to note:

1. ExLlamaV2 only supports quantized LLMs, so Float32/16 is not supported here.
2. ExLlamaV2 currently [does not have support](https://github.com/turboderp/exllamav2/issues/184) for Mac/Metal.
3. Although CPU offloading is supported, it is too slow to be practical, so we did not include it in our benchmarks.
46 changes: 16 additions & 30 deletions bench_exllamav2/bench.py
@@ -3,12 +3,13 @@
import sys
import time
from collections import defaultdict
from dataclasses import dataclass

import numpy as np
import torch
from exllamav2 import ExLlamaV2Cache, model_init
from exllamav2 import ExLlamaV2, ExLlamaV2Cache
from exllamav2.config import ExLlamaV2Config
from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
from exllamav2.tokenizer import ExLlamaV2Tokenizer

logging.getLogger("llama_cpp").setLevel(logging.ERROR)
logging.basicConfig(
@@ -18,49 +19,34 @@
)


@dataclass
class ExtraConfig:
model_dir: str
length: int = 2048
rope_scale: float = 1.0
rope_alpha: float = 1.0
no_flash_attn: bool = False
low_mem: bool = False
gpu_split: str = None


class ExllamaV2Benchmark:
def __init__(self, model_path: str) -> None:
self.model_path = model_path
self.cache = None
self.results = []
self.model_path, self.results = model_path, []

def load_model(self):
self.model, self.tokenizer = model_init.init(
ExtraConfig(model_dir=self.model_path), allow_auto_split=True
)
self.config = ExLlamaV2Config()
self.config.model_dir = self.model_path
self.config.prepare()

self.model = ExLlamaV2(self.config)
self.cache = ExLlamaV2Cache(self.model, lazy=True)
self.model.load_autosplit(self.cache)
self.tokenizer = ExLlamaV2Tokenizer(self.config)

self.generator = ExLlamaV2BaseGenerator(self.model, self.cache, self.tokenizer)
self.settings = ExLlamaV2Sampler.Settings()
self.settings.temperature = 0.85
self.settings.top_k = 50
self.settings.top_p = 0.8
self.settings.token_repetition_penalty = 1.15

if not self.model.loaded:
self.cache = ExLlamaV2Cache(self.model)
self.model.load_autosplit(self.cache)
self.cache = None
self.cache = ExLlamaV2Cache(self.model)
self.generator = ExLlamaV2BaseGenerator(self.model, self.cache, self.tokenizer)
self.settings.token_repetition_penalty = 1.05
self.settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
self.generator.warmup()
return self

@torch.inference_mode()
def run_model(self, prompt: str, max_tokens: int) -> float:
start = time.time()
_ = self.generator.generate_simple(
prompt, self.settings, max_tokens, token_healing=True
)
_ = self.generator.generate_simple(prompt, self.settings, max_tokens, seed=1234)
delta = time.time() - start
return len(self.generator.sequence_ids[0]) / delta
