Merge pull request #26 from premAI-io/main
Merge from main
Anindyadeep authored Jan 29, 2024
2 parents c804d79 + 795a811 commit 747b6ca
Showing 13 changed files with 248 additions and 108 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -38,12 +38,12 @@ Take a first glance of Llama-2-7B Model Performance Metrics Across Different Pre
| tinygrad | - | 20.32 ± 0.06 | - | - |
| onnx | - | 54.16 ± 3.15 | - | - |
| transformers (pytorch) | 43.79 ± 0.61 | 46.39 ± 0.28 | 6.98 ± 0.05 | 21.72 ± 0.11 |
| vllm | 91.23 ± 3.61 | 91.44 ± 3.83 | - | 113.38 ± 11.70|
| exllamav2 | - | - | 116.91 ± 1.73 | 164.28 ± 4.07 |
| vllm | 90.78 ± 1.60 | 90.54 ± 2.22 | - | - |
| exllamav2 | - | - | 121.63 ± 0.74 | 130.16 ± 0.35 |
| ctransformers | - | - | 76.75 ± 10.36 | 84.26 ± 5.79 |
| AutoGPTQ | 42.01 ± 1.03 | 30.24 ± 0.41 | - | - |
| AutoAWQ | - | - | - | 116.94 ± 13.14|
| DeepSpeed | - | 83.54 ± 5.25 | - | |
| AutoAWQ | - | - | - | 109.20 ± 3.28 |
| DeepSpeed | - | 81.44 ± 8.13 | - | |
| PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 |
| Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - |
| Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - |
36 changes: 36 additions & 0 deletions bench_autoawq/README.md
@@ -0,0 +1,36 @@
# AutoAWQ

[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/casper-hansen/AutoAWQ)  
[![ArXiv](https://img.shields.io/badge/arXiv-%230170FE.svg?style=for-the-badge&logo=arxiv&logoColor=white)](https://arxiv.org/abs/2306.00978)


[AutoAWQ](https://github.com/casper-hansen/AutoAWQ) is a polished implementation of the original [llm-awq](https://github.com/mit-han-lab/llm-awq) work from MIT. AWQ (Activation-aware Weight Quantization) is a quantization method that supports 4-bit quantization. It substantially increases inference throughput while reducing the model's memory requirements (for example, according to this [reference](https://huggingface.co/TheBloke/Llama-2-70B-Chat-AWQ), Llama2 70B normally requires 2 x 80 GB GPUs, but with AutoAWQ it can run on a single 48 GB GPU). You can learn more about AWQ in the research paper and the GitHub implementations.
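
The sketch below shows how an AWQ-quantized checkpoint is typically loaded and queried with AutoAWQ; the checkpoint id and generation settings are illustrative assumptions, not this benchmark's exact code (see [bench.py](/bench_autoawq/bench.py) for the actual harness):

```python
# Minimal sketch, assuming `autoawq` and `transformers` are installed and a CUDA GPU is available.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "TheBloke/Llama-2-7B-AWQ"  # illustrative 4-bit AWQ checkpoint

# Load the 4-bit AWQ weights; fuse_layers enables the fused CUDA kernels for faster decoding.
model = AutoAWQForCausalLM.from_quantized(model_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(model_path)

input_ids = tokenizer("Explain what a transformer is.", return_tensors="pt").input_ids.cuda()
output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```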

### 🚀 Running the AutoAWQ Benchmark.

You can run the AutoAWQ benchmark using the following command:

```bash
./bench_autoawq/bench.sh \
  --prompt <value> \              # Enter a prompt string
  --max_tokens <value> \          # Maximum number of tokens to output
  --repetitions <value> \         # Number of repetitions to be made for the prompt
  --log_file <file_path> \        # A .log file under which to write the results
  --device <cpu/cuda/metal> \     # The device on which to run the benchmark
  --models_dir <path_to_models>   # The directory in which AWQ model weights are present
```

To get started quickly you can simply run:

```bash
./bench_autoawq/bench.sh -d cuda
```
This will use the default values (see the [bench.sh](/bench_autoawq/bench.sh) file) and run the benchmarks. You can find all the benchmark results for AutoAWQ [here](/docs/llama2.md).


### 👀 Some points to note:

1. AutoAWQ does not support devices other than GPU (it only runs when CUDA is available).
2. We benchmark AutoAWQ on its own (i.e. the actual AWQ quantization method). We do not benchmark combinations such as AutoAWQ + vLLM or AutoAWQ + TensorRT.
3. The default model chosen for this benchmark is [Llama2-AutoAWQ by The Bloke](https://huggingface.co/TheBloke/Llama-2-7B-AWQ).
4. AutoAWQ does not yet support INT8 quantization properly. See [this issue](https://github.com/casper-hansen/AutoAWQ/issues/45).
15 changes: 6 additions & 9 deletions bench_autoawq/bench.py
@@ -19,10 +19,8 @@

class LlamaAutoAWQBenchmark:
def __init__(self, model_path: str, precision: int, device: str) -> None:
assert precision in ["fp16"], "For benchmarks supported precision is in FP16."
assert (
device == "cuda"
), "Since it's an optimization for FP-16, CPU not supported."
assert device == "cuda", "Device other than CUDA is not supported for autoawq."
assert precision == "int4", "Precision other than INT4 is not supported."

self.model_path, self.precision, self.device = (
model_path,
@@ -99,11 +97,10 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
)
report = defaultdict(lambda: defaultdict(float))

# Hardcoding precision to fp16 for AutoAWQ
precision = 16
precision = 4

if args.device == "cpu":
logging.info("Skipping running model on fp16 on CPU, not implemented for Half")
logging.info("Skipping running model on int4 on CPU, not implemented for Half")
pass
else:
logging.info(
@@ -112,15 +109,15 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
llama_autogptq_benchmark = LlamaAutoAWQBenchmark(
model_path=f"{args.models_dir}/llama-2-7b-autoawq",
device=args.device,
precision=f"fp{precision}",
precision=f"int{precision}",
).load_model()
llama_autogptq_benchmark.benchmark(
max_tokens=args.max_tokens,
prompt=args.prompt,
repetitions=args.repetitions,
)

report["Llama AutoAWQ"][f"FP-{precision}"] = {
report["Llama AutoAWQ"][f"INT-{precision}"] = {
"mean": np.mean(llama_autogptq_benchmark.results),
"std": np.std(llama_autogptq_benchmark.results),
}
44 changes: 29 additions & 15 deletions bench_autoawq/bench.sh
@@ -6,26 +6,27 @@
#
# Usage: ./bench.sh [OPTIONS]
# OPTIONS:
# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')
# -r, --repetitions Number of repetitions for benchmarks (default: 2)
# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)
# -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')
# -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')
# -r, --repetitions Number of repetitions for benchmarks (default: 10)
# -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)
# -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')
# -lf, --log_file Logging file name.
# -md, --models_dir Models directory.
# -h, --help Show this help message
########################################################################################################

set -euo pipefail

CURRENT_DIR="$(pwd)"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

print_usage() {
echo "Usage: $0 [OPTIONS]"
echo "OPTIONS:"
echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')"
echo " -r, --repetitions Number of repetitions for benchmarks (default: 2)"
echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)"
echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')"
echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')"
echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)"
echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)"
echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')"
echo " -lf, --log_file Logging file name."
echo " -md, --models_dir Models directory."
echo " -h, --help Show this help message"
@@ -69,6 +70,18 @@ check_python() {


setup() {

# Check if Logs folder exists else Make the logs folder
LOGS_FOLDER="$CURRENT_DIR/Logs"

if [ -d "$LOGS_FOLDER" ]; then
echo "Folder '$LOGS_FOLDER' already exists. Skipping."
else
# Create the folder
mkdir "$LOGS_FOLDER"
echo "'$LOGS_FOLDER' created."
fi

echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
bash "$SCRIPT_DIR"/setup.sh
}
@@ -142,15 +155,16 @@ while [ "$#" -gt 0 ]; do
esac
done

check_platform
check_python
setup

# Set default values if not provided
PROMPT="${PROMPT:-"Explain what is a transformer"}"
PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}"
REPETITIONS="${REPETITIONS:-10}"
MAX_TOKENS="${MAX_TOKENS:-100}"
DEVICE="${DEVICE:-'cpu'}"
LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}"
MAX_TOKENS="${MAX_TOKENS:-512}"
DEVICE="${DEVICE:-'cuda'}"
LOG_FILENAME="${LOG_FILENAME:-"$LOGS_FOLDER/benchmark_autoawq_$(date +'%Y%m%d%H%M%S').log"}"
MODELS_DIR="${MODELS_DIR:-"./models"}"

check_platform
check_python
setup
run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR"
34 changes: 34 additions & 0 deletions bench_exllamav2/README.md
@@ -0,0 +1,34 @@
# ExLlamaV2

[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/turboderp/exllamav2) &nbsp;

[ExLlamaV2](https://github.com/turboderp/exllamav2) uses custom kernels to speed up LLM inference under different quantizations. ExLlamaV2 supports the new "EXL2" format, which is based on the same optimization method as GPTQ and supports 2, 3, 4, 5, 6 and 8-bit quantization. For this benchmark implementation, we use the 4-bit and 8-bit quantized versions of Llama2.
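
The benchmark harness (see the bench_exllamav2/bench.py diff later in this commit) loads an EXL2 model roughly as in the sketch below; the model directory path is an illustrative assumption:

```python
# Minimal sketch of the ExLlamaV2 load path used by the benchmark; the model_dir value is assumed.
from exllamav2 import ExLlamaV2, ExLlamaV2Cache
from exllamav2.config import ExLlamaV2Config
from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
from exllamav2.tokenizer import ExLlamaV2Tokenizer

config = ExLlamaV2Config()
config.model_dir = "./models/llama-2-7b-exl2"  # directory holding the EXL2 weights (assumed path)
config.prepare()

model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy=True)  # lazy cache so load_autosplit can place layers across GPUs
model.load_autosplit(cache)
tokenizer = ExLlamaV2Tokenizer(config)

generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
settings = ExLlamaV2Sampler.Settings()
settings.temperature = 0.85
settings.top_k = 50
settings.top_p = 0.8
settings.token_repetition_penalty = 1.05

generator.warmup()  # keep CUDA kernel warmup out of the timed runs
print(generator.generate_simple("Explain what a transformer is.", settings, 128, seed=1234))
```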


### 🚀 Running the ExLlamaV2 Benchmark.

You can run the ExLlamaV2 benchmark using the following command:

```bash
./bench_exllamav2/bench.sh \
  --prompt <value> \              # Enter a prompt string
  --max_tokens <value> \          # Maximum number of tokens to output
  --repetitions <value> \         # Number of repetitions to be made for the prompt
  --log_file <file_path> \        # A .log file under which to write the results
  --device <cpu/cuda/metal> \     # The device on which to run the benchmark
  --models_dir <path_to_models>   # The directory in which model weights are present
```

To get started quickly you can simply run:

```bash
./bench_exllamav2/bench.sh -d cuda
```
This will use the default values (see the [bench.sh](/bench_exllamav2/bench.sh) file) and perform the benchmarks. You can find all the benchmark results for ExLlamaV2 [here](/docs/llama2.md).


### 👀 Some points to note:

1. ExLlamaV2 only supports quantized LLMs, so Float32/16 is not supported here.
2. ExLlamaV2 currently [does not have support](https://github.com/turboderp/exllamav2/issues/184) for Mac/Metal.
3. Although CPU offloading is supported, it is too slow to be practical, so we did not include it in our benchmarks.
46 changes: 16 additions & 30 deletions bench_exllamav2/bench.py
@@ -3,12 +3,13 @@
import sys
import time
from collections import defaultdict
from dataclasses import dataclass

import numpy as np
import torch
from exllamav2 import ExLlamaV2Cache, model_init
from exllamav2 import ExLlamaV2, ExLlamaV2Cache
from exllamav2.config import ExLlamaV2Config
from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
from exllamav2.tokenizer import ExLlamaV2Tokenizer

logging.getLogger("llama_cpp").setLevel(logging.ERROR)
logging.basicConfig(
@@ -18,49 +19,34 @@
)


@dataclass
class ExtraConfig:
model_dir: str
length: int = 2048
rope_scale: float = 1.0
rope_alpha: float = 1.0
no_flash_attn: bool = False
low_mem: bool = False
gpu_split: str = None


class ExllamaV2Benchmark:
def __init__(self, model_path: str) -> None:
self.model_path = model_path
self.cache = None
self.results = []
self.model_path, self.results = model_path, []

def load_model(self):
self.model, self.tokenizer = model_init.init(
ExtraConfig(model_dir=self.model_path), allow_auto_split=True
)
self.config = ExLlamaV2Config()
self.config.model_dir = self.model_path
self.config.prepare()

self.model = ExLlamaV2(self.config)
self.cache = ExLlamaV2Cache(self.model, lazy=True)
self.model.load_autosplit(self.cache)
self.tokenizer = ExLlamaV2Tokenizer(self.config)

self.generator = ExLlamaV2BaseGenerator(self.model, self.cache, self.tokenizer)
self.settings = ExLlamaV2Sampler.Settings()
self.settings.temperature = 0.85
self.settings.top_k = 50
self.settings.top_p = 0.8
self.settings.token_repetition_penalty = 1.15

if not self.model.loaded:
self.cache = ExLlamaV2Cache(self.model)
self.model.load_autosplit(self.cache)
self.cache = None
self.cache = ExLlamaV2Cache(self.model)
self.generator = ExLlamaV2BaseGenerator(self.model, self.cache, self.tokenizer)
self.settings.token_repetition_penalty = 1.05
self.settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
self.generator.warmup()
return self

@torch.inference_mode()
def run_model(self, prompt: str, max_tokens: int) -> float:
start = time.time()
_ = self.generator.generate_simple(
prompt, self.settings, max_tokens, token_healing=True
)
_ = self.generator.generate_simple(prompt, self.settings, max_tokens, seed=1234)
delta = time.time() - start
return len(self.generator.sequence_ids[0]) / delta
