diff --git a/README.md b/README.md
index ac60881a..f1f30473 100644
--- a/README.md
+++ b/README.md
@@ -38,12 +38,12 @@ Take a first glance of Llama-2-7B Model Performance Metrics Across Different Pre
 | tinygrad | - | 20.32 ± 0.06 | - | - |
 | onnx | - | 54.16 ± 3.15 | - | - |
 | transformers (pytorch) | 43.79 ± 0.61 | 46.39 ± 0.28 | 6.98 ± 0.05 | 21.72 ± 0.11 |
-| vllm | 91.23 ± 3.61 | 91.44 ± 3.83 | - | 113.38 ± 11.70|
-| exllamav2 | - | - | 116.91 ± 1.73 | 164.28 ± 4.07 |
+| vllm | 90.78 ± 1.60 | 90.54 ± 2.22 | - | - |
+| exllamav2 | - | - | 121.63 ± 0.74 | 130.16 ± 0.35 |
 | ctransformers | - | - | 76.75 ± 10.36 | 84.26 ± 5.79 |
 | AutoGPTQ | 42.01 ± 1.03 | 30.24 ± 0.41 | - | - |
-| AutoAWQ | - | - | - | 116.94 ± 13.14|
-| DeepSpeed | - | 83.54 ± 5.25 | - | |
+| AutoAWQ | - | - | - | 109.20 ± 3.28 |
+| DeepSpeed | - | 81.44 ± 8.13 | - | |
 | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 |
 | Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - |
 | Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - |
diff --git a/bench_autoawq/README.md b/bench_autoawq/README.md
new file mode 100644
index 00000000..1f556ed2
--- /dev/null
+++ b/bench_autoawq/README.md
@@ -0,0 +1,36 @@
+# AutoAWQ
+
+[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/casper-hansen/AutoAWQ)&nbsp;
+[![ArXiv](https://img.shields.io/badge/arXiv-%230170FE.svg?style=for-the-badge&logo=arxiv&logoColor=white)](https://arxiv.org/abs/2306.00978)
+
+
+[AutoAWQ](https://github.com/casper-hansen/AutoAWQ) is a package that provides a polished implementation of the original [llm-awq](https://github.com/mit-han-lab/llm-awq) work from MIT. AWQ (Activation-aware Weight Quantization) is a quantization method that supports 4-bit quantization. It substantially increases inference throughput while decreasing the memory requirement of the model. (For example, according to this [reference](https://huggingface.co/TheBloke/Llama-2-70B-Chat-AWQ), Llama 2 70B normally requires 2 x 80 GB GPUs, but with AWQ it can run on a single 48 GB GPU.) You can learn more about AWQ in the research paper and the GitHub implementations linked above.
+
+### 🚀 Running the AutoAWQ Benchmark.
+
+You can run the AutoAWQ benchmark using the following command:
+
+```bash
+./bench_autoawq/bench.sh \
+ --prompt \ # Enter a prompt string
+ --max_tokens \ # Maximum number of tokens to output
+ --repetitions \ # Number of repetitions to be made for the prompt.
+ --log_file \ # A .log file under which we want to write the results.
+ --device \ # The device on which we want to run the benchmark.
+ --models_dir # The directory in which AWQ model weights are present
+```
+
+To get started quickly you can simply run:
+
+```bash
+./bench_autoawq/bench.sh -d cuda
+```
+This will take all the default values (see the [bench.sh](/bench_autoawq/bench.sh) file) and run the benchmarks. You can find all the benchmark results for AutoAWQ [here](/docs/llama2.md).
+
+
+### 👀 Some points to note:
+
+1. AutoAWQ does not support devices other than GPUs (it runs only when CUDA is available).
+2. We benchmark AutoAWQ on its own (i.e. the actual AWQ quantization method). We do not benchmark combinations like AutoAWQ + vLLM or AutoAWQ + TensorRT.
+3. The default model chosen for this benchmark is [Llama2-AutoAWQ by TheBloke](https://huggingface.co/TheBloke/Llama-2-7B-AWQ).
+4. AutoAWQ does not support INT8 quantization properly yet. See [this issue](https://github.com/casper-hansen/AutoAWQ/issues/45).
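+
+### 🧩 A minimal usage sketch
+
+For readers who want to see what the benchmark wraps, here is a minimal, hedged sketch of loading and generating with a 4-bit AWQ model through AutoAWQ's `AutoAWQForCausalLM` API. The model path, prompt, and generation settings below are illustrative assumptions, not the exact configuration used by [bench.py](/bench_autoawq/bench.py):
+
+```python
+from awq import AutoAWQForCausalLM
+from transformers import AutoTokenizer
+
+# Assumed local path to 4-bit AWQ weights; point this at your --models_dir.
+model_path = "./models/llama-2-7b-autoawq"
+
+# fuse_layers enables AutoAWQ's fused kernels for faster decoding.
+model = AutoAWQForCausalLM.from_quantized(model_path, fuse_layers=True)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+inputs = tokenizer("Write an essay about the transformer model architecture", return_tensors="pt").input_ids.cuda()
+output = model.generate(inputs, max_new_tokens=512)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```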
diff --git a/bench_autoawq/bench.py b/bench_autoawq/bench.py
index d57d5b4b..17fb6a66 100644
--- a/bench_autoawq/bench.py
+++ b/bench_autoawq/bench.py
@@ -19,10 +19,8 @@ class LlamaAutoAWQBenchmark:
     def __init__(self, model_path: str, precision: int, device: str) -> None:
-        assert precision in ["fp16"], "For benchmarks supported precision is in FP16."
-        assert (
-            device == "cuda"
-        ), "Since it's an optimization for FP-16, CPU not supported."
+        assert device == "cuda", "Device other than CUDA is not supported for autoawq."
+        assert precision == "int4", "Precision other than INT4 is not supported."
 
         self.model_path, self.precision, self.device = (
             model_path,
@@ -99,11 +97,10 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
     )
     report = defaultdict(lambda: defaultdict(float))
 
-    # Hardcoding precision to fp16 for AutoAWQ
-    precision = 16
+    precision = 4
 
     if args.device == "cpu":
-        logging.info("Skipping running model on fp16 on CPU, not implemented for Half")
+        logging.info("Skipping running model on int4 on CPU; AutoAWQ requires CUDA.")
         pass
     else:
         logging.info(
@@ -112,7 +109,7 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
         llama_autogptq_benchmark = LlamaAutoAWQBenchmark(
             model_path=f"{args.models_dir}/llama-2-7b-autoawq",
             device=args.device,
-            precision=f"fp{precision}",
+            precision=f"int{precision}",
         ).load_model()
         llama_autogptq_benchmark.benchmark(
             max_tokens=args.max_tokens,
@@ -120,7 +117,7 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
             repetitions=args.repetitions,
         )
 
-        report["Llama AutoAWQ"][f"FP-{precision}"] = {
+        report["Llama AutoAWQ"][f"INT-{precision}"] = {
             "mean": np.mean(llama_autogptq_benchmark.results),
             "std": np.std(llama_autogptq_benchmark.results),
         }
diff --git a/bench_autoawq/bench.sh b/bench_autoawq/bench.sh
index 3c977e61..41c3c11f 100755
--- a/bench_autoawq/bench.sh
+++ b/bench_autoawq/bench.sh
@@ -6,10 +6,10 @@
 #
 # Usage: ./bench.sh [OPTIONS]
 # OPTIONS:
-# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')
-# -r, --repetitions Number of repetitions for benchmarks (default: 2)
-# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)
-# -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')
+# -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')
+# -r, --repetitions Number of repetitions for benchmarks (default: 10)
+# -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)
+# -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')
 # -lf, --log_file Logging file name.
 # -md, --models_dir Models directory.
 # -h, --help Show this help message
@@ -17,15 +17,16 @@
 
 set -euo pipefail
 
+CURRENT_DIR="$(pwd)"
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 print_usage() {
     echo "Usage: $0 [OPTIONS]"
     echo "OPTIONS:"
-    echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')"
-    echo " -r, --repetitions Number of repetitions for benchmarks (default: 2)"
-    echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)"
-    echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')"
+    echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')"
+    echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)"
+    echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)"
+    echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')"
     echo " -lf, --log_file Logging file name."
     echo " -md, --models_dir Models directory."
     echo " -h, --help Show this help message"
@@ -69,6 +70,18 @@ check_python() {
 
 setup() {
+
+    # Check if Logs folder exists else Make the logs folder
+    LOGS_FOLDER="$CURRENT_DIR/Logs"
+
+    if [ -d "$LOGS_FOLDER" ]; then
+        echo "Folder '$LOGS_FOLDER' already exists. Skipping."
+    else
+        # Create the folder
+        mkdir "$LOGS_FOLDER"
+        echo "'$LOGS_FOLDER' created."
+    fi
+
     echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
     bash "$SCRIPT_DIR"/setup.sh
 }
@@ -142,15 +155,16 @@ while [ "$#" -gt 0 ]; do
     esac
 done
 
+check_platform
+check_python
+setup
+
 # Set default values if not provided
-PROMPT="${PROMPT:-"Explain what is a transformer"}"
+PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}"
 REPETITIONS="${REPETITIONS:-10}"
-MAX_TOKENS="${MAX_TOKENS:-100}"
-DEVICE="${DEVICE:-'cpu'}"
-LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}"
+MAX_TOKENS="${MAX_TOKENS:-512}"
+DEVICE="${DEVICE:-'cuda'}"
+LOG_FILENAME="${LOG_FILENAME:-"$LOGS_FOLDER/benchmark_autoawq_$(date +'%Y%m%d%H%M%S').log"}"
 MODELS_DIR="${MODELS_DIR:-"./models"}"
 
-check_platform
-check_python
-setup
 run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR"
diff --git a/bench_exllamav2/README.md b/bench_exllamav2/README.md
new file mode 100644
index 00000000..9c912591
--- /dev/null
+++ b/bench_exllamav2/README.md
@@ -0,0 +1,34 @@
+# ExLlamaV2
+
+[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/turboderp/exllamav2)&nbsp;
+
+[ExLlamaV2](https://github.com/turboderp/exllamav2) uses custom kernels to speed up LLM inference under different quantizations. ExLlamaV2 supports the new "EXL2" format, which is based on the same optimization method as GPTQ and supports 2, 3, 4, 5, 6 and 8-bit quantization. For this benchmark implementation, we use the 4-bit and 8-bit quantized versions of Llama 2.
+
+
+### 🚀 Running the ExLlamaV2 Benchmark.
+
+You can run the ExLlamaV2 benchmark using the following command:
+
+```bash
+./bench_exllamav2/bench.sh \
+ --prompt \ # Enter a prompt string
+ --max_tokens \ # Maximum number of tokens to output
+ --repetitions \ # Number of repetitions to be made for the prompt.
+ --log_file \ # A .log file under which we want to write the results.
+ --device \ # The device on which we want to run the benchmark.
+ --models_dir # The directory in which model weights are present
+```
+
+To get started quickly you can simply run:
+
+```bash
+./bench_exllamav2/bench.sh -d cuda
+```
+This will take all the default values (see the [bench.sh](/bench_exllamav2/bench.sh) file) and perform the benchmarks. You can find all the benchmark results for ExLlamaV2 [here](/docs/llama2.md).
+
+
+### 👀 Some points to note:
+
+1. ExLlamaV2 only supports quantized LLMs, so Float32/16 precision is not supported here.
+2. ExLlamaV2 currently [does not have support](https://github.com/turboderp/exllamav2/issues/184) for Mac/Metal.
+3. Although ExLlamaV2 supports offloading to CPU, it is too slow to be practical, so we did not include it in our benchmarks.
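+
+### 🧩 A minimal usage sketch
+
+The loading path used by [bench.py](/bench_exllamav2/bench.py) boils down to the following sketch. The model directory and sampling values here are illustrative; see the benchmark source for the exact settings:
+
+```python
+from exllamav2 import ExLlamaV2, ExLlamaV2Cache
+from exllamav2.config import ExLlamaV2Config
+from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
+from exllamav2.tokenizer import ExLlamaV2Tokenizer
+
+config = ExLlamaV2Config()
+config.model_dir = "./models/llama-2-7b-exllamav2"  # assumed path to EXL2 weights
+config.prepare()
+
+model = ExLlamaV2(config)
+cache = ExLlamaV2Cache(model, lazy=True)  # lazy cache so load_autosplit can size it across GPUs
+model.load_autosplit(cache)
+tokenizer = ExLlamaV2Tokenizer(config)
+
+generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
+settings = ExLlamaV2Sampler.Settings()
+settings.temperature = 0.85
+
+print(generator.generate_simple("Write an essay about the transformer model architecture", settings, 512))
+```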
diff --git a/bench_exllamav2/bench.py b/bench_exllamav2/bench.py
index da44e370..3e5029bb 100644
--- a/bench_exllamav2/bench.py
+++ b/bench_exllamav2/bench.py
@@ -3,12 +3,13 @@
 import sys
 import time
 from collections import defaultdict
-from dataclasses import dataclass
 
 import numpy as np
 import torch
-from exllamav2 import ExLlamaV2Cache, model_init
+from exllamav2 import ExLlamaV2, ExLlamaV2Cache
+from exllamav2.config import ExLlamaV2Config
 from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
+from exllamav2.tokenizer import ExLlamaV2Tokenizer
 
 logging.getLogger("llama_cpp").setLevel(logging.ERROR)
 logging.basicConfig(
@@ -18,39 +19,26 @@
 )
 
 
-@dataclass
-class ExtraConfig:
-    model_dir: str
-    length: int = 2048
-    rope_scale: float = 1.0
-    rope_alpha: float = 1.0
-    no_flash_attn: bool = False
-    low_mem: bool = False
-    gpu_split: str = None
-
-
 class ExllamaV2Benchmark:
     def __init__(self, model_path: str) -> None:
-        self.model_path = model_path
-        self.cache = None
-        self.results = []
+        self.model_path, self.results = model_path, []
 
     def load_model(self):
-        self.model, self.tokenizer = model_init.init(
-            ExtraConfig(model_dir=self.model_path), allow_auto_split=True
-        )
+        self.config = ExLlamaV2Config()
+        self.config.model_dir = self.model_path
+        self.config.prepare()
+
+        self.model = ExLlamaV2(self.config)
+        self.cache = ExLlamaV2Cache(self.model, lazy=True)
+        self.model.load_autosplit(self.cache)
+        self.tokenizer = ExLlamaV2Tokenizer(self.config)
+
+        self.generator = ExLlamaV2BaseGenerator(self.model, self.cache, self.tokenizer)
+
         self.settings = ExLlamaV2Sampler.Settings()
         self.settings.temperature = 0.85
         self.settings.top_k = 50
         self.settings.top_p = 0.8
-        self.settings.token_repetition_penalty = 1.15
-
-        if not self.model.loaded:
-            self.cache = ExLlamaV2Cache(self.model)
-            self.model.load_autosplit(self.cache)
-
-        self.cache = None
-        self.cache = ExLlamaV2Cache(self.model)
-        self.generator = ExLlamaV2BaseGenerator(self.model, self.cache, self.tokenizer)
+        self.settings.token_repetition_penalty = 1.05
         self.settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
         self.generator.warmup()
         return self
@@ -58,9 +46,7 @@ def load_model(self):
     @torch.inference_mode()
     def run_model(self, prompt: str, max_tokens: int) -> float:
         start = time.time()
-        _ = self.generator.generate_simple(
-            prompt, self.settings, max_tokens, token_healing=True
-        )
+        _ = self.generator.generate_simple(prompt, self.settings, max_tokens, seed=1234)
         delta = time.time() - start
         return len(self.generator.sequence_ids[0]) / delta
 
diff --git a/bench_exllamav2/bench.sh b/bench_exllamav2/bench.sh
index 8c179741..33c56fdc 100755
--- a/bench_exllamav2/bench.sh
+++ b/bench_exllamav2/bench.sh
@@ -1,17 +1,15 @@
 #!/bin/bash
 
-#!/bin/bash
-
 ########################################################################################################
 # Script: bench.sh
-# Description: This script runs benchmarks llama.cpp llama benchmark.
+# Description: This script runs the ExLlamaV2 Llama-2 benchmark.
 #
 # Usage: ./bench.sh [OPTIONS]
 # OPTIONS:
-# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')
-# -r, --repetitions Number of repetitions for benchmarks (default: 2)
-# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)
-# -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')
+# -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')
+# -r, --repetitions Number of repetitions for benchmarks (default: 10)
+# -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)
+# -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')
 # -lf, --log_file Logging file name.
 # -md, --models_dir Models directory.
 # -h, --help Show this help message
@@ -19,15 +17,16 @@
 
 set -euo pipefail
 
+CURRENT_DIR="$(pwd)"
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 print_usage() {
     echo "Usage: $0 [OPTIONS]"
     echo "OPTIONS:"
-    echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')"
-    echo " -r, --repetitions Number of repetitions for benchmarks (default: 2)"
-    echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)"
-    echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')"
+    echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')"
+    echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)"
+    echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)"
+    echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')"
     echo " -lf, --log_file Logging file name."
    echo " -md, --models_dir Models directory."
     echo " -h, --help Show this help message"
@@ -59,16 +58,29 @@ check_platform() {
 }
 
 check_python() {
-    if command -v python &> /dev/null
-    then
-        echo -e "\nUsing $(python --version)."
+    if command -v python &> /dev/null; then
+        PYTHON_CMD="python"
+    elif command -v python3 &> /dev/null; then
+        PYTHON_CMD="python3"
     else
-        echo -e "\nPython does not exist."
+        echo "Python is not installed."
         exit 1
     fi
 }
 
 setup() {
+
+    # Check if Logs folder exists else Make the logs folder
+    LOGS_FOLDER="$CURRENT_DIR/Logs"
+
+    if [ -d "$LOGS_FOLDER" ]; then
+        echo "Folder '$LOGS_FOLDER' already exists. Skipping."
+    else
+        # Create the folder
+        mkdir "$LOGS_FOLDER"
+        echo "'$LOGS_FOLDER' created."
+    fi
+
     echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
bash "$SCRIPT_DIR"/setup.sh } @@ -83,7 +95,7 @@ run_benchmarks() { # shellcheck disable=SC1091 source "$SCRIPT_DIR/venv/bin/activate" - python "$SCRIPT_DIR"/bench.py \ + "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ --prompt "$PROMPT" \ --repetitions "$REPETITIONS" \ --max_tokens "$MAX_TOKENS" \ @@ -143,17 +155,16 @@ while [ "$#" -gt 0 ]; do esac done +check_platform +check_python +setup # Set default values if not provided -PROMPT="${PROMPT:-"Explain what is a transformer"}" +PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" REPETITIONS="${REPETITIONS:-10}" -MAX_TOKENS="${MAX_TOKENS:-100}" +MAX_TOKENS="${MAX_TOKENS:-512}" DEVICE="${DEVICE:-'cuda'}" -LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}" +LOG_FILENAME="${LOG_FILENAME:-"$LOGS_FOLDER/benchmark_exllamav2_$(date +'%Y%m%d%H%M%S').log"}" MODELS_DIR="${MODELS_DIR:-"./models"}" -check_platform -check_cuda -check_python -setup run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR" diff --git a/bench_exllamav2/requirements.txt b/bench_exllamav2/requirements.txt index 7aea264d..eae6bc03 100644 --- a/bench_exllamav2/requirements.txt +++ b/bench_exllamav2/requirements.txt @@ -1 +1,2 @@ exllamav2 +tqdm diff --git a/bench_exllamav2/setup.sh b/bench_exllamav2/setup.sh index ee872df8..cebc3787 100755 --- a/bench_exllamav2/setup.sh +++ b/bench_exllamav2/setup.sh @@ -6,12 +6,23 @@ # requirements. ################################################################################ +check_python() { + if command -v python &> /dev/null; then + PYTHON_CMD="python" + elif command -v python3 &> /dev/null; then + PYTHON_CMD="python3" + else + echo "Python is not installed." + exit 1 + fi +} + convert_bin_to_safetensor() { local HF_MODEL_FOLDER_PATH="$1" # shellcheck disable=SC1091 source "$SCRIPT_DIR/venv/bin/activate" - python "$SCRIPT_DIR"/convert.py \ + "$PYTHON_CMD" "$SCRIPT_DIR"/convert.py \ "$HF_MODEL_FOLDER_PATH" } @@ -41,7 +52,7 @@ convert_safetensor_to_exllamav2() { else mkdir -p "$EXLLAMA_WEIGHTS_FOLDER" echo "Going for conversion to exllamav2 format from .safetensors in $QUANTIZATION bit quantization." - python "$SCRIPT_DIR/exllamav2/convert.py" \ + "$PYTHON_CMD" "$SCRIPT_DIR/exllamav2/convert.py" \ -i "$HF_WEIGHTS_FOLDER" \ -o "$EXLLAMA_WEIGHTS_FOLDER" \ -c "$SCRIPT_DIR/wikitext-test.parquet" \ @@ -54,6 +65,8 @@ convert_safetensor_to_exllamav2() { } +check_python + CURRENT_DIR="$(pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -62,7 +75,7 @@ MODELS_DIR="${MODELS_DIR:-"models/llama-2-7b-hf"}" EXLLAMA_BASE_MODEL_DIR="${EXLLAMA_BASE_MODEL_DIR:-"./models/llama-2-7b-exllamav2"}" if [ ! -d "$VENV_DIR" ]; then - python -m venv "$VENV_DIR" + "$PYTHON_CMD" -m venv "$VENV_DIR" echo "Virtual environment '$VENV_DIR' created." # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" diff --git a/bench_lightning/README.md b/bench_lightning/README.md new file mode 100644 index 00000000..ce70d8bd --- /dev/null +++ b/bench_lightning/README.md @@ -0,0 +1,35 @@ +# Lightning + +[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/Lightning-AI/lit-gpt)   + +[Lit-GPT](https://github.com/Lightning-AI/lit-gpt) is a hackable implementation of [different Open Source LLMs](https://github.com/Lightning-AI/lit-gpt?tab=readme-ov-file#-lit-gpt-1). Lit-GPT is written using the [Lightning Fabric](https://lightning.ai/docs/fabric/stable/) framework. 
+Lightning Fabric is a fast and lightweight way to scale PyTorch models. It comes with features that enable distributed training and inference with ease. Lightning Fabric is based on [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/starter/introduction.html).
+
+
+### 🚀 Running the Lightning Benchmark.
+
+You can run the Lightning benchmark using the following command:
+
+```bash
+./bench_lightning/bench.sh \
+ --prompt \ # Enter a prompt string
+ --max_tokens \ # Maximum number of tokens to output
+ --repetitions \ # Number of repetitions to be made for the prompt.
+ --log_file \ # A .log file under which we want to write the results.
+ --device \ # The device on which we want to run the benchmark.
+ --models_dir # The directory in which model weights are present
+```
+
+To get started quickly you can simply run:
+
+```bash
+./bench_lightning/bench.sh -d cuda
+```
+This will take all the default values (see the [bench.sh](/bench_lightning/bench.sh) file) and perform the benchmarks. You can find all the benchmark results for Lightning [here](/docs/llama2.md).
+
+
+### 👀 Some points to note:
+
+1. This implementation runs the Llama-2-7B model. Lit-GPT requires converting HuggingFace models to its own lit-gpt format; the conversion step can be found in the [setup.sh](/bench_lightning/setup.sh) file.
+2. Running this benchmark requires the [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b), so we assume that you have already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and been verified to download the weights.
+3. On Metal devices it runs out of memory, so we were not able to run inference on Metal.
+4. On CPU it is simply too slow (over a minute per run), so we skipped CPU benchmarking.
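+
+### 🧩 A minimal Fabric sketch
+
+To make the Lightning Fabric point above concrete, here is a minimal, hedged sketch of how Fabric wraps a plain PyTorch model for device placement and precision handling. The toy module and settings are illustrative assumptions, not the Lit-GPT benchmark code:
+
+```python
+import torch
+import lightning as L
+
+fabric = L.Fabric(accelerator="cuda", devices=1, precision="bf16-true")
+fabric.launch()
+
+model = torch.nn.Linear(4096, 4096)  # stand-in for an LLM block
+model = fabric.setup(model)          # moves the module to the device in the chosen precision
+
+x = torch.randn(1, 4096, device=fabric.device, dtype=torch.bfloat16)
+with torch.inference_mode():
+    y = model(x)
+print(y.shape)
+```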
diff --git a/bench_lightning/bench.sh b/bench_lightning/bench.sh
index d9ace574..359a1d4a 100755
--- a/bench_lightning/bench.sh
+++ b/bench_lightning/bench.sh
@@ -2,14 +2,14 @@
 
 ########################################################################################################
 # Script: bench.sh
-# Description: This script runs benchmarks llama.cpp llama benchmark.
+# Description: This script runs the LightningAI Lit-GPT Llama-2 benchmark.
 #
 # Usage: ./bench.sh [OPTIONS]
 # OPTIONS:
-# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')
-# -r, --repetitions Number of repetitions for benchmarks (default: 2)
-# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)
-# -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')
+# -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')
+# -r, --repetitions Number of repetitions for benchmarks (default: 10)
+# -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)
+# -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')
 # -lf, --log_file Logging file name.
 # -md, --models_dir Models directory.
 # -h, --help Show this help message
@@ -17,15 +17,16 @@
 
 set -euo pipefail
 
+CURRENT_DIR="$(pwd)"
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 print_usage() {
     echo "Usage: $0 [OPTIONS]"
     echo "OPTIONS:"
-    echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')"
-    echo " -r, --repetitions Number of repetitions for benchmarks (default: 2)"
-    echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)"
-    echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')"
+    echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')"
+    echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)"
+    echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)"
+    echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')"
     echo " -lf, --log_file Logging file name."
     echo " -md, --models_dir Models directory."
     echo " -h, --help Show this help message"
@@ -69,6 +70,17 @@ check_python() {
 
 setup() {
+    # Check if Logs folder exists else Make the logs folder
+    LOGS_FOLDER="$CURRENT_DIR/Logs"
+
+    if [ -d "$LOGS_FOLDER" ]; then
+        echo "Folder '$LOGS_FOLDER' already exists. Skipping."
+    else
+        # Create the folder
+        mkdir "$LOGS_FOLDER"
+        echo "'$LOGS_FOLDER' created."
+    fi
+
     echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
     bash "$SCRIPT_DIR"/setup.sh
 }
@@ -141,15 +153,16 @@ while [ "$#" -gt 0 ]; do
     esac
 done
 
+check_platform
+check_python
+setup
+
 # Set default values if not provided
-PROMPT="${PROMPT:-"Explain what is a transformer"}"
+PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}"
 REPETITIONS="${REPETITIONS:-10}"
-MAX_TOKENS="${MAX_TOKENS:-100}"
-DEVICE="${DEVICE:-'cpu'}"
-LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}"
+MAX_TOKENS="${MAX_TOKENS:-512}"
+DEVICE="${DEVICE:-'cuda'}"
+LOG_FILENAME="${LOG_FILENAME:-"$LOGS_FOLDER/benchmark_lightning_$(date +'%Y%m%d%H%M%S').log"}"
 MODELS_DIR="${MODELS_DIR:-"./models"}"
 
-check_platform
-check_python
-setup
 run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR"
diff --git a/docs/llama2.md b/docs/llama2.md
index a5ac2fa1..0277640a 100644
--- a/docs/llama2.md
+++ b/docs/llama2.md
@@ -18,12 +18,12 @@
 | tinygrad | - | 20.32 ± 0.06 | - | - |
 | onnx | - | 54.16 ± 3.15 | - | - |
 | transformers (pytorch) | 43.79 ± 0.61 | 46.39 ± 0.28 | 6.98 ± 0.05 | 21.72 ± 0.11 |
-| vllm | 91.23 ± 3.61 | 91.44 ± 3.83 | - | 113.38 ± 11.70|
-| exllamav2 | - | - | 116.91 ± 1.73 | 164.28 ± 4.07 |
+| vllm | 90.78 ± 1.60 | 90.54 ± 2.22 | - | - |
+| exllamav2 | - | - | 121.63 ± 0.74 | 130.16 ± 0.35 |
 | ctransformers | - | - | 76.75 ± 10.36 | 84.26 ± 5.79 |
 | AutoGPTQ | 42.01 ± 1.03 | 30.24 ± 0.41 | - | - |
-| AutoAWQ | - | - | - | 116.94 ± 13.14|
-| DeepSpeed | - | 83.54 ± 5.25 | - | |
+| AutoAWQ | - | - | - | 109.20 ± 3.28 |
+| DeepSpeed | - | 81.44 ± 8.13 | - | |
 | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 |
 | Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - |
 | Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - |
diff --git a/docs/llama2.md.template b/docs/llama2.md.template
index 1c1f6950..bd99774a 100644
--- a/docs/llama2.md.template
+++ b/docs/llama2.md.template
@@ -18,12 +18,12 @@
 | tinygrad | - | 20.32 ± 0.06 | - | - |
 | onnx | - | 54.16 ± 3.15 | - | - |
 | transformers (pytorch) | 43.79 ± 0.61 | 46.39 ± 0.28 | 6.98 ± 0.05 | 21.72 ± 0.11 |
-| vllm | 91.23 ± 3.61 | 91.44 ± 3.83 | - | 113.38 ± 11.70|
-| exllamav2 | - | - | 116.91 ± 1.73 | 164.28 ± 4.07 |
+| vllm | 90.78 ± 1.60 | 90.54 ± 2.22 | - | - |
+| exllamav2 | - | - | 121.63 ± 0.74 | 130.16 ± 0.35 |
 | ctransformers | - | - | 76.75 ± 10.36 | 84.26 ± 5.79 |
 | AutoGPTQ | 42.01 ± 1.03 | 30.24 ± 0.41 | - | - |
-| AutoAWQ | - | - | - | 116.94 ± 13.14|
-| DeepSpeed | - | 83.54 ± 5.25 | - | |
+| AutoAWQ | - | - | - | 109.20 ± 3.28 |
+| DeepSpeed | - | 81.44 ± 8.13 | - | |
 | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 |
 | Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - |
 | Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - |
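
All of the mean ± std entries in the tables above are computed the same way: each bench.py collects one tokens/sec sample per repetition and reports `np.mean`/`np.std` over those samples (see the `report` code in the AutoAWQ diff). A minimal sketch of that reduction, with illustrative numbers rather than real results:

```python
import numpy as np

# tokens/sec collected over the benchmark repetitions (illustrative values)
results = [109.9, 107.4, 110.3]

print(f"{np.mean(results):.2f} ± {np.std(results):.2f}")
```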