From 4fdf5dfdb8a0b581b32eb1c8e8027fc736d8d490 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 23 Jan 2024 00:14:46 +0530 Subject: [PATCH 01/59] Added readme for burn --- bench_burn/README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 bench_burn/README.md diff --git a/bench_burn/README.md b/bench_burn/README.md new file mode 100644 index 00000000..5a63ef4b --- /dev/null +++ b/bench_burn/README.md @@ -0,0 +1,33 @@ +# Burn + +[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/Gadersd/llama2-burn)   + +We use [Llama2-Burn project](https://github.com/Gadersd/llama2-burn), which provides a port pf the Llama2 model to [Burn](https://github.com/tracel-ai/burn). Burn is the DeepLearning Framework for Rust, which provides similar concepts and interfaces like PyTorch. + + +### 🚀 Running the Burn Benchmark. + +For running this benchmark, make sure you have [Rust installed](https://www.rust-lang.org/tools/install). You can run the Burn benchmark using the following command: + +```bash +./bench_burn/bench.sh \ + --prompt \ # Enter a prompt string + --max_tokens \ # Maximum number of tokens to output + --repetitions \ # Number of repititions to be made for the prompt. + --log_file \ # A .log file underwhich we want to write the results. + --device \ # The device in which we want to benchmark. + --models_dir # The directory in which AWQ model weights are present +``` + +To get started quickly you can simply run: + +```bash +./bench_burn/bench.sh -d cuda +``` +This will take all the default values (see in the [bench.sh](/bench_burn/bench.sh) file) and do the benchmarks. You can find all the benchmarks results for Burn [here](/docs/llama2.md). + + +### 👀 Some points to note: + +1. For CUDA and Metal, Burn runs for only Float32 precision. +2. You need to download weights of LLama-2 7B from HuggingFace. This repo already does it. However it assumes that you already have accepted the [terms and condition](https://huggingface.co/meta-llama/Llama-2-7b-hf) before running or downloading the model and runnning this benchmark. From 3efffea7f02479bd7351082c2dbfabd47ae98e65 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 23 Jan 2024 12:51:08 +0530 Subject: [PATCH 02/59] added readme for candle --- bench_candle/README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 bench_candle/README.md diff --git a/bench_candle/README.md b/bench_candle/README.md new file mode 100644 index 00000000..dd8bd7a3 --- /dev/null +++ b/bench_candle/README.md @@ -0,0 +1,33 @@ +# Candle + +[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/huggingface/candle)   + +[Candle](https://github.com/huggingface/candle) is a minimalistic Machine /Deep Learning framework written on Rust by [huggingface](https://github.com/huggingface). It tries to provide a simpler interface to implement models along with GPU support. This is a modified implementation of [Llama2-Candle example](https://github.com/huggingface/candle/blob/main/candle-examples/examples/llama/main.rs) to analyse the benchmark performance across different devices and precision. + + +### 🚀 Running the Candle Benchmark. + +For running this benchmark, make sure you have [Rust installed](https://www.rust-lang.org/tools/install). 
You can run the Candle benchmark using the following command: + +```bash +./bench_candle/bench.sh \ + --prompt \ # Enter a prompt string + --max_tokens \ # Maximum number of tokens to output + --repetitions \ # Number of repititions to be made for the prompt. + --log_file \ # A .log file underwhich we want to write the results. + --device \ # The device in which we want to benchmark. + --models_dir # The directory in which AWQ model weights are present +``` + +To get started quickly you can simply run: + +```bash +./bench_candle/bench.sh -d cuda +``` +This will take all the default values (see in the [bench.sh](/bench_candle/bench.sh) file) and perform the benchmarks. You can find all the benchmarks results for Candle [here](/docs/llama2.md). + + +### 👀 Some points to note: + +1. Candle does not support Float32 from the latest implementation. This imlementation of Candle Llama2 does not support quantized weights of int8/4 precisions. +2. Candle does not have support for Metal devices. From 118de441af1b17e3a8c28dff90b0447e55731406 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 23 Jan 2024 14:48:29 +0530 Subject: [PATCH 03/59] Update bench_candle/README.md Co-authored-by: Nicola Sosio --- bench_candle/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_candle/README.md b/bench_candle/README.md index dd8bd7a3..b84e3208 100644 --- a/bench_candle/README.md +++ b/bench_candle/README.md @@ -16,7 +16,7 @@ For running this benchmark, make sure you have [Rust installed](https://www.rust --repetitions \ # Number of repititions to be made for the prompt. --log_file \ # A .log file underwhich we want to write the results. --device \ # The device in which we want to benchmark. - --models_dir # The directory in which AWQ model weights are present + --models_dir # The directory in which model weights are present ``` To get started quickly you can simply run: From 9b5275ce6478a9ebbde63f30bd20e7e620055a3e Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 23 Jan 2024 14:48:34 +0530 Subject: [PATCH 04/59] Update bench_candle/README.md Co-authored-by: Nicola Sosio --- bench_candle/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_candle/README.md b/bench_candle/README.md index b84e3208..5a7589cd 100644 --- a/bench_candle/README.md +++ b/bench_candle/README.md @@ -29,5 +29,5 @@ This will take all the default values (see in the [bench.sh](/bench_candle/bench ### 👀 Some points to note: -1. Candle does not support Float32 from the latest implementation. This imlementation of Candle Llama2 does not support quantized weights of int8/4 precisions. +1. Candle does not support Float32 from the latest implementation. This implementation of Candle Llama2 does not support quantized weights of int8/4 precisions. 2. Candle does not have support for Metal devices. 
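
Each of these `bench.sh` entry points wraps a per-framework harness that times text generation and reports throughput in tokens/second, repeating the run several times and summarising the samples as mean ± standard deviation in the log file and in [docs/llama2.md](/docs/llama2.md). The snippet below is only a minimal sketch of that measurement loop, not code from this repository: `tokens_per_second`, `benchmark` and the stand-in `fake_generate` are illustrative names, and it assumes a `generate()` callable that returns the produced token ids.

```python
import time
from statistics import mean, stdev


def tokens_per_second(generate, prompt: str, max_tokens: int) -> float:
    """Time one generation call and convert it into a tokens/sec sample."""
    start = time.perf_counter()
    output_ids = generate(prompt, max_tokens)  # assumed to return the generated token ids
    elapsed = time.perf_counter() - start
    return len(output_ids) / elapsed


def benchmark(generate, prompt: str, max_tokens: int, repetitions: int) -> str:
    # One sample per repetition, summarised as "mean ± std" like the result tables.
    samples = [tokens_per_second(generate, prompt, max_tokens) for _ in range(repetitions)]
    return f"{mean(samples):.2f} ± {stdev(samples):.2f}"


if __name__ == "__main__":
    def fake_generate(prompt, max_tokens):
        # Stand-in generator so the sketch runs without downloading any model weights.
        return list(range(max_tokens))

    print(benchmark(fake_generate, "Explain what is a transformer", 100, 10))
```
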
From 1e3b36aa906dddcc92bb3806a615c013b91b8bfc Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 23 Jan 2024 14:53:21 +0530 Subject: [PATCH 05/59] Update bench_burn/README.md Co-authored-by: Nicola Sosio --- bench_burn/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_burn/README.md b/bench_burn/README.md index 5a63ef4b..ce66a773 100644 --- a/bench_burn/README.md +++ b/bench_burn/README.md @@ -2,7 +2,7 @@ [![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/Gadersd/llama2-burn)   -We use [Llama2-Burn project](https://github.com/Gadersd/llama2-burn), which provides a port pf the Llama2 model to [Burn](https://github.com/tracel-ai/burn). Burn is the DeepLearning Framework for Rust, which provides similar concepts and interfaces like PyTorch. +We use [Llama2-Burn project](https://github.com/Gadersd/llama2-burn), which provides a port of the Llama2 model to [Burn](https://github.com/tracel-ai/burn). Burn is the DeepLearning Framework for Rust, which provides similar concepts and interfaces like PyTorch. ### 🚀 Running the Burn Benchmark. From 501c88ccfc180f642c86ccfe52a2b3b37c4b868c Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 23 Jan 2024 14:53:28 +0530 Subject: [PATCH 06/59] Update bench_burn/README.md Co-authored-by: Nicola Sosio --- bench_burn/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_burn/README.md b/bench_burn/README.md index ce66a773..a1a45217 100644 --- a/bench_burn/README.md +++ b/bench_burn/README.md @@ -16,7 +16,7 @@ For running this benchmark, make sure you have [Rust installed](https://www.rust --repetitions \ # Number of repititions to be made for the prompt. --log_file \ # A .log file underwhich we want to write the results. --device \ # The device in which we want to benchmark. - --models_dir # The directory in which AWQ model weights are present + --models_dir # The directory in which model weights are present ``` To get started quickly you can simply run: From 485358440fb92a4383a36ac83d4e17aeba978b0e Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 23 Jan 2024 22:52:44 +0530 Subject: [PATCH 07/59] minor typo fix --- bench_onnxruntime/bench.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bench_onnxruntime/bench.py b/bench_onnxruntime/bench.py index 4264de26..a6cc23a9 100644 --- a/bench_onnxruntime/bench.py +++ b/bench_onnxruntime/bench.py @@ -54,7 +54,9 @@ def benchmark(self, prompt, max_tokens, repetitions): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="llama.cpp Benchmark Llama model.") + parser = argparse.ArgumentParser( + description="ONXX Runtime Benchmark for Llama model." 
+ ) parser.add_argument( "--prompt", type=str, From 5e4593c76662e9e72e3b5d696cb2b89ec5f802e3 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 23 Jan 2024 22:53:02 +0530 Subject: [PATCH 08/59] Readme for ONNXRuntime --- bench_onnxruntime/README.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 bench_onnxruntime/README.md diff --git a/bench_onnxruntime/README.md b/bench_onnxruntime/README.md new file mode 100644 index 00000000..3755ae46 --- /dev/null +++ b/bench_onnxruntime/README.md @@ -0,0 +1,34 @@ +# ONNX Runtime + +[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/ggerganov/llama.cpp)   + + +[ONNX (Open Neural Network Exchange) Runtime](https://github.com/microsoft/onnxruntime) is an open-source, cross-platform runtime that enables efficient execution of neural network models trained in various frameworks, promoting interoperability and flexibility in deploying machine learning models. This benchmark implementation uses [HuggingFace Optimum](https://github.com/huggingface/optimum) which supports models running under ONNX Runtime. + +### 🚀 Running the ONNX Runtime Benchmark. + +You can run the ONNX Runtime benchmark using the following command: + +```bash +./bench_onnxruntime/bench.sh \ + --prompt \ # Enter a prompt string + --max_tokens \ # Maximum number of tokens to output + --repetitions \ # Number of repititions to be made for the prompt. + --log_file \ # A .log file underwhich we want to write the results. + --device \ # The device in which we want to benchmark. + --models_dir # The directory in which model weights are present +``` + +To get started quickly you can simply run: + +```bash +./bench_onnxruntime/bench.sh -d cuda +``` +This will take all the default values (see in the [bench.sh](/bench_onnxruntime/bench.sh) file) and perform the benchmarks. You can find all the benchmarks results for ONNX Runtime [here](/docs/llama2.md). + + +### 👀 Some points to note: + +1. ONNX Runtime requires HuggingFace Llama2-7B weights. And it converts those weights into ONNX format using this [setup.sh](/bench_onnxruntime/setup.sh) script. So running this benchmark would assume that you already agree to the required terms and conditions and verified to download the weights. +2. ONNX Runtime GPU only support Float16 precision format. +3. Running LLama 2 using ONNX Runtime in CPU/Metal is too memory intensive, so benchmarking is skipped for those. From 120ebac082b6ae9bccf72eadb9642d81bd34d026 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 23 Jan 2024 23:13:25 +0530 Subject: [PATCH 09/59] removed exceptions for optimum-nvidia --- docs/llama2.md.template | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/llama2.md.template b/docs/llama2.md.template index 985a2546..f684ddb7 100644 --- a/docs/llama2.md.template +++ b/docs/llama2.md.template @@ -79,6 +79,5 @@ *Note: AutoAWQ is not supported devices other than GPU (only supports when CUDA is available). *Note: Pytorch Lightning runs out of memory in metal (out of 18 GB) so benchmark not available. *Note: CPU/Metal is not supported right now. Support for CPU is on [developement](https://github.com/vllm-project/vllm/pull/1028). No developement for metal so far. -*Note: Optimum Nvidia only supports CUDA right now. Also it supports float 16/32 as precision. It additionally supports FP-8 precision. We do not add this, just to keep everything same for all other candidates. 
*Note: DeepSpeed inference is not supported for Metal/CPU devices. Also, it only works for fp-16 precision. *Note: Nvidia TensorRT LLM only supports for CUDA. From 2d050eb7a051b21c5285105dc3da6203fd9f7a4c Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 23 Jan 2024 23:13:40 +0530 Subject: [PATCH 10/59] Added readme for optimum nvidia --- bench_optimum_nvidia/README.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 bench_optimum_nvidia/README.md diff --git a/bench_optimum_nvidia/README.md b/bench_optimum_nvidia/README.md new file mode 100644 index 00000000..1f89444a --- /dev/null +++ b/bench_optimum_nvidia/README.md @@ -0,0 +1,34 @@ +# Optimum-Nvidia + +[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/huggingface/optimum-nvidia)   + +[Optimum-Nvidia](https://github.com/huggingface/optimum-nvidia) is a Large Language Model inference library developed by HuggingFace. It leverages the advanced compilation capabilities of [Nvidia's TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) to enhance performance. The library specifically takes advantage of the Float8 format supported on Nvidia's Ada Lovelace and Hopper architectures. It's worth noting that benchmarking for Float8 is not currently included in this implementation, as it is not widely supported in other inference engines or providers. + +### 🚀 Running the Optimum-Nvidia Benchmark. + +Before running this benchmark, make sure you have Docker installed. You can run the Optimum-Nvidia benchmark using the following command: + +```bash +./bench_optimum_nvidia/bench.sh \ + --prompt \ # Enter a prompt string + --max_tokens \ # Maximum number of tokens to output + --repetitions \ # Number of repititions to be made for the prompt. + --log_file \ # A .log file underwhich we want to write the results. + --device \ # The device in which we want to benchmark. + --models_dir # The directory in which model weights are present +``` + +To get started quickly you can simply run: + +```bash +./bench_optimum_nvidia/bench.sh -d cuda +``` +This will take all the default values (see in the [bench.sh](/bench_optimum_nvidia/bench.sh) file) and perform the benchmarks. You can find all the benchmarks results for Optimum-Nvidia [here](/docs/llama2.md). + + +### 👀 Some points to note: + +1. Optimum Nvidia although supports quantized versions, but it uses AutoGPTQ under the hood. Since we wanted to do independent benchmarking, so we skipped it for now. +2. Optimum Nvidia uses Docker to convert the models into a specific engine format. You can find the weight conversion logic under [setup.sh](/bench_optimum_nvidia/setup.sh) file. +3. Optimum Nvidia only supports CUDA. +4. Optimum Nvidia requires HuggingFace Llama2-7B weights. So running this benchmark would assume that you already agree to the required terms and conditions and verified to download the weights. From f340571ec81841e501843b43dc707a9460488bcc Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 23 Jan 2024 23:40:11 +0530 Subject: [PATCH 11/59] removed tensorrt llm exception inside llama2.md.template --- docs/llama2.md.template | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/llama2.md.template b/docs/llama2.md.template index 985a2546..db304fbb 100644 --- a/docs/llama2.md.template +++ b/docs/llama2.md.template @@ -79,6 +79,4 @@ *Note: AutoAWQ is not supported devices other than GPU (only supports when CUDA is available). 
*Note: Pytorch Lightning runs out of memory in metal (out of 18 GB) so benchmark not available. *Note: CPU/Metal is not supported right now. Support for CPU is on [developement](https://github.com/vllm-project/vllm/pull/1028). No developement for metal so far. -*Note: Optimum Nvidia only supports CUDA right now. Also it supports float 16/32 as precision. It additionally supports FP-8 precision. We do not add this, just to keep everything same for all other candidates. *Note: DeepSpeed inference is not supported for Metal/CPU devices. Also, it only works for fp-16 precision. -*Note: Nvidia TensorRT LLM only supports for CUDA. From 5ad99551ce2c1ad45cc191ca8e95f8f92cff23eb Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 23 Jan 2024 23:40:44 +0530 Subject: [PATCH 12/59] Added Readme for Tensorrt-LLM Benchmark --- bench_tensorrtllm/README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 bench_tensorrtllm/README.md diff --git a/bench_tensorrtllm/README.md b/bench_tensorrtllm/README.md new file mode 100644 index 00000000..d4d59cbc --- /dev/null +++ b/bench_tensorrtllm/README.md @@ -0,0 +1,33 @@ +# TensorRT-LLM + +[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/NVIDIA/TensorRT-LLM)   + +[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is a Python library that facilitates the creation and optimization of Large Language Models (LLMs) for efficient inference on NVIDIA GPUs. TensorRT-LLM supports various quantization modes, including INT4 and INT8 weights, along with FP16 activations, allowing users to maximize performance and minimize memory usage. It also provides pre-defined models that can be easily customized and extended to meet specific requirements, and it integrates with the [NVIDIA Triton Inference Server](https://github.com/triton-inference-server/server) for production deployment. + +### 🚀 Running the TensorRT-LLM Benchmark. + +Running TensorRT-LLM requires Docker. So make sure you have installed Docker. You can run the TensorRT-LLM benchmark using the following command: + +```bash +./bench_tensorrt_llm/bench.sh \ + --prompt \ # Enter a prompt string + --max_tokens \ # Maximum number of tokens to output + --repetitions \ # Number of repititions to be made for the prompt. + --log_file \ # A .log file underwhich we want to write the results. + --device \ # The device in which we want to benchmark. + --models_dir # The directory in which model weights are present +``` + +To get started quickly you can simply run: + +```bash +./bench_tensorrt_llm/bench.sh -d cuda +``` +This will take all the default values (see in the [bench.sh](/bench_tensorrt_llm/bench.sh) file) and perform the benchmarks. You can find all the benchmarks results for TensorRT-LLM [here](/docs/llama2.md). + + +### 👀 Some points to note: + +1. Running this benchmark requires HuggingFace Llama2-7B weights. So running this benchmark would assume that you already agree to the required terms and conditions and verified to download the weights. +2. TensorRT LLM only works with CUDA. So it does not support Metal/CPU. +3. Although TensorRT supports INT4/8 quantization, but it uses AutoGPTQ under the hood. Since we wanted to keep our implementation independent, so we skipped it. 
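
The `.log` file passed via `--log_file` in each of these benchmarks collects one summary line per framework and precision. The sketch below illustrates only that aggregation and reporting step, under stated assumptions: the sample numbers, the `("framework", "precision")` keys and the `benchmark.log` filename are made up for the example, while the real aggregation lives inside each framework's `bench.py`.

```python
from collections import defaultdict

import numpy as np

# Raw tokens/sec samples per (framework, precision) pair, one value per repetition.
# These numbers are placeholders, not measured results.
raw_samples = {
    ("llama_tensorrt_llm", "fp16"): [101.2, 99.8, 103.4],
    ("llama_tensorrt_llm", "fp32"): [60.1, 60.7, 59.9],
}

report = defaultdict(dict)
for (framework, precision), samples in raw_samples.items():
    report[framework][precision] = {
        "mean": float(np.mean(samples)),
        "std": float(np.std(samples)),
    }

# Append "framework, precision: mean ± std" lines to the benchmark log.
with open("benchmark.log", "a") as log_file:
    for framework, precisions in report.items():
        for precision, stats in precisions.items():
            log_file.write(
                f"{framework}, {precision}: {stats['mean']:.2f} ± {stats['std']:.2f}\n"
            )
```
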
From 3a102a4d16c1592247cccd6dffb1d502412cd792 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 24 Jan 2024 00:09:34 +0530 Subject: [PATCH 13/59] Add readme for tinygrad benchmarking --- bench_tinygrad/README.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 bench_tinygrad/README.md diff --git a/bench_tinygrad/README.md b/bench_tinygrad/README.md new file mode 100644 index 00000000..3755ae46 --- /dev/null +++ b/bench_tinygrad/README.md @@ -0,0 +1,34 @@ +# ONNX Runtime + +[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/ggerganov/llama.cpp)   + + +[ONNX (Open Neural Network Exchange) Runtime](https://github.com/microsoft/onnxruntime) is an open-source, cross-platform runtime that enables efficient execution of neural network models trained in various frameworks, promoting interoperability and flexibility in deploying machine learning models. This benchmark implementation uses [HuggingFace Optimum](https://github.com/huggingface/optimum) which supports models running under ONNX Runtime. + +### 🚀 Running the ONNX Runtime Benchmark. + +You can run the ONNX Runtime benchmark using the following command: + +```bash +./bench_onnxruntime/bench.sh \ + --prompt \ # Enter a prompt string + --max_tokens \ # Maximum number of tokens to output + --repetitions \ # Number of repititions to be made for the prompt. + --log_file \ # A .log file underwhich we want to write the results. + --device \ # The device in which we want to benchmark. + --models_dir # The directory in which model weights are present +``` + +To get started quickly you can simply run: + +```bash +./bench_onnxruntime/bench.sh -d cuda +``` +This will take all the default values (see in the [bench.sh](/bench_onnxruntime/bench.sh) file) and perform the benchmarks. You can find all the benchmarks results for ONNX Runtime [here](/docs/llama2.md). + + +### 👀 Some points to note: + +1. ONNX Runtime requires HuggingFace Llama2-7B weights. And it converts those weights into ONNX format using this [setup.sh](/bench_onnxruntime/setup.sh) script. So running this benchmark would assume that you already agree to the required terms and conditions and verified to download the weights. +2. ONNX Runtime GPU only support Float16 precision format. +3. Running LLama 2 using ONNX Runtime in CPU/Metal is too memory intensive, so benchmarking is skipped for those. From 5aeba536ad87aac8152e5e67f6e191578283e77c Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 24 Jan 2024 00:11:53 +0530 Subject: [PATCH 14/59] Add readme for tinygrad benchmarking --- bench_tinygrad/README.md | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/bench_tinygrad/README.md b/bench_tinygrad/README.md index 3755ae46..dbed1b26 100644 --- a/bench_tinygrad/README.md +++ b/bench_tinygrad/README.md @@ -1,16 +1,16 @@ -# ONNX Runtime +# TinyGrad -[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/ggerganov/llama.cpp)   +[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/tinygrad/tinygrad)   +TinyGrad is a minimalistic deep learning framework, very similar to [PyTorch](https://github.com/pytorch/pytorch). It's simplicity is inspired from the [micrograd](https://github.com/karpathy/micrograd) implementation by [Andrej Karpathy](https://karpathy.ai/). 
TinyGrad uses different methods like lazy computation and kernel fusion techniques to run different operations. It supports various accelerators out of the box, including CPU, GPU, etc. This benchmark implementation uses the [Llama 2 example](https://github.com/tinygrad/tinygrad/blob/master/examples/llama.py) written inside tinygrad/examples.
 
-[ONNX (Open Neural Network Exchange) Runtime](https://github.com/microsoft/onnxruntime) is an open-source, cross-platform runtime that enables efficient execution of neural network models trained in various frameworks, promoting interoperability and flexibility in deploying machine learning models. This benchmark implementation uses [HuggingFace Optimum](https://github.com/huggingface/optimum) which supports models running under ONNX Runtime.
 
-### 🚀 Running the ONNX Runtime Benchmark.
+### 🚀 Running the TinyGrad Benchmark.
 
-You can run the ONNX Runtime benchmark using the following command:
+You can run the TinyGrad benchmark using the following command:
 
 ```bash
-./bench_onnxruntime/bench.sh \
+./bench_tinygrad/bench.sh \
     --prompt \         # Enter a prompt string
     --max_tokens \     # Maximum number of tokens to output
     --repetitions \    # Number of repititions to be made for the prompt.
@@ -22,13 +22,12 @@ You can run the ONNX Runtime benchmark using the following command:
 To get started quickly you can simply run:
 
 ```bash
-./bench_onnxruntime/bench.sh -d cuda
+./bench_tinygrad/bench.sh -d cuda
 ```
-This will take all the default values (see in the [bench.sh](/bench_onnxruntime/bench.sh) file) and perform the benchmarks. You can find all the benchmarks results for ONNX Runtime [here](/docs/llama2.md).
+This will take all the default values (see in the [bench.sh](/bench_tinygrad/bench.sh) file) and perform the benchmarks. You can find all the benchmarks results for TinyGrad [here](/docs/llama2.md).
 
 
 ### 👀 Some points to note:
 
-1. ONNX Runtime requires HuggingFace Llama2-7B weights. And it converts those weights into ONNX format using this [setup.sh](/bench_onnxruntime/setup.sh) script. So running this benchmark would assume that you already agree to the required terms and conditions and verified to download the weights.
-2. ONNX Runtime GPU only support Float16 precision format.
-3. Running LLama 2 using ONNX Runtime in CPU/Metal is too memory intensive, so benchmarking is skipped for those.
+1. The current implementation of TinyGrad only supports Float16 for CUDA, CPU and Metal.
+2. This benchmark implementation expects the Raw Llama 2 weights from Meta AI to run LLama2 Model. So it assumes that you already accepted all the [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) before running it.

From a01b1bf5b779410870ea204697124c6c726459ee Mon Sep 17 00:00:00 2001
From: Anindyadeep
Date: Mon, 29 Jan 2024 17:54:26 +0000
Subject: [PATCH 15/59] updated readme with more info

---
 bench_tensorrtllm/README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/bench_tensorrtllm/README.md b/bench_tensorrtllm/README.md
index d4d59cbc..c614e163 100644
--- a/bench_tensorrtllm/README.md
+++ b/bench_tensorrtllm/README.md
@@ -28,6 +28,5 @@ This will take all the default values (see in the [bench.sh](/bench_tensorrt_llm
 
 ### 👀 Some points to note:
 
-1.
Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b). So running this benchmark would assume that you already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and got verified to download the weights. 2. TensorRT LLM only works with CUDA. So it does not support Metal/CPU. -3. Although TensorRT supports INT4/8 quantization, but it uses AutoGPTQ under the hood. Since we wanted to keep our implementation independent, so we skipped it. From de7f7b08e00d03cf3a87b614922e029651b6a20c Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Mon, 29 Jan 2024 17:57:18 +0000 Subject: [PATCH 16/59] both logging and pring for tensorrt llm --- bench_tensorrtllm/bench.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/bench_tensorrtllm/bench.py b/bench_tensorrtllm/bench.py index 1d7370a6..286fb1f4 100644 --- a/bench_tensorrtllm/bench.py +++ b/bench_tensorrtllm/bench.py @@ -1,5 +1,7 @@ import argparse import json +import logging +import sys import time from collections import defaultdict from pathlib import Path @@ -11,6 +13,18 @@ from tensorrt_llm.runtime import ModelConfig, SamplingConfig from transformers import AutoTokenizer +logging.getLogger("ctranslate2").setLevel(logging.ERROR) +logging.basicConfig( + stream=sys.stdout, + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", +) + + +def log_and_print(message: str) -> None: + print(message) + logging.info(message) + class LlamaTensorRTMBenchmark: def __init__( @@ -112,7 +126,7 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None: end_id=2, pad_id=2, num_beams=1, temperature=0.1 ) for i in range(repetitions): - print( + log_and_print( f"Running repetition [{str(i+1).zfill(len(str(repetitions)))}/{repetitions}]" ) tokens_per_second = self.run_model( @@ -150,14 +164,14 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None: help="Path to the models directory.", ) args = parser.parse_args() - print( + log_and_print( f"Running benchmark with: max_tokens={args.max_tokens} prompt={args.prompt} " + f"repetitions={args.repetitions} device={args.device}" ) report = defaultdict(lambda: defaultdict(float)) for precision in ("fp16", "fp32"): - print( + log_and_print( f"Running TensorRT LLM benchmark (pytorch backend) on Llama with precision: {precision}" ) llama_tensorrt_benchmark = LlamaTensorRTMBenchmark( @@ -171,16 +185,16 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None: max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions ) - report["llama_transformers_pytorch"][precision] = { + report["llama_transformers_tensorrt"][precision] = { "mean": np.mean(llama_tensorrt_benchmark.results), "std": np.std(llama_tensorrt_benchmark.results), } - print("Benchmark Report") + log_and_print("Benchmark Report") with open(args.log_file, "a") as file: for framework, quantizations in report.items(): for quantization, stats in quantizations.items(): - print( + log_and_print( f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}" ) print( From 80106a88dbd26eb8ee92ffdf1ba3302f3d2d02f8 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Mon, 29 Jan 2024 17:58:00 +0000 Subject: [PATCH 17/59] Added logs folder and with updated cli args --- bench_tensorrtllm/bench.sh | 59 ++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git 
a/bench_tensorrtllm/bench.sh b/bench_tensorrtllm/bench.sh index a118a4ba..2426f3c8 100755 --- a/bench_tensorrtllm/bench.sh +++ b/bench_tensorrtllm/bench.sh @@ -2,14 +2,14 @@ ######################################################################################################## # Script: bench.sh -# Description: This script runs benchmarks TensorRT Llama benchmark. +# Description: This script runs benchmarks TensorRT-LLM Llama-2 benchmark. # # Usage: ./bench.sh [OPTIONS] # OPTIONS: -# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer') -# -r, --repetitions Number of repetitions for benchmarks (default: 2) -# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100) -# -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu') +# -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') +# -r, --repetitions Number of repetitions for benchmarks (default: 10) +# -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) +# -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') # -lf, --log_file Logging file name. # -md, --models_dir Models directory. # -h, --help Show this help message @@ -17,15 +17,16 @@ set -euo pipefail +CURRENT_DIR="$(pwd)" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" print_usage() { echo "Usage: $0 [OPTIONS]" echo "OPTIONS:" - echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')" - echo " -r, --repetitions Number of repetitions for benchmarks (default: 2)" - echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)" - echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')" + echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" + echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" + echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" + echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" echo " -lf, --log_file Logging file name." echo " -md, --models_dir Models directory." echo " -h, --help Show this help message" @@ -58,16 +59,29 @@ check_platform() { } check_python() { - if command -v python &> /dev/null - then - echo -e "\nUsing $(python --version)." + if command -v python &> /dev/null; then + PYTHON_CMD="python" + elif command -v python3 &> /dev/null; then + PYTHON_CMD="python3" else - echo -e "\nPython does not exist." + echo "Python is not installed." exit 1 fi } setup() { + + # Check if Logs folder exists else Make the logs folder + LOGS_FOLDER="$CURRENT_DIR/Logs" + + if [ -d "$LOGS_FOLDER" ]; then + echo "Folder '$LOGS_FOLDER' already exists. Skipping." + else + # Create the folder + mkdir "$LOGS_FOLDER" + echo "'$LOGS_FOLDER' created." + fi + echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 
bash "$SCRIPT_DIR"/setup.sh } @@ -82,7 +96,7 @@ run_benchmarks() { local MODELS_DIR="$6" # shellcheck disable=SC1091 - python "$SCRIPT_DIR"/bench.py \ + "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ --prompt "$PROMPT" \ --repetitions "$REPETITIONS" \ --max_tokens "$MAX_TOKENS" \ @@ -141,18 +155,20 @@ while [ "$#" -gt 0 ]; do ;; esac done -# Set default values if not provided -PROMPT="${PROMPT:-"Explain what is a transformer"}" -REPETITIONS="${REPETITIONS:-10}" -MAX_TOKENS="${MAX_TOKENS:-100}" -DEVICE="${DEVICE:-'cpu'}" -LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}" -MODELS_DIR="${MODELS_DIR:-"/mnt/models"}" check_platform check_python setup +# Set default values if not provided +PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" +REPETITIONS="${REPETITIONS:-10}" +MAX_TOKENS="${MAX_TOKENS:-512}" +DEVICE="${DEVICE:-'cuda'}" +LOG_FILENAME="${LOG_FILENAME:-"/mnt/Logs/benchmark_ctranslate_$(date +'%Y%m%d%H%M%S').log"}" +MODELS_DIR="${MODELS_DIR:-"/mnt/models"}" + + docker run \ --gpus all \ --ipc=host \ @@ -161,6 +177,7 @@ docker run \ -e PYTHONUNBUFFERED=1 \ -v "$(pwd)/models:/mnt/models" \ -v "$SCRIPT_DIR:/mnt/scripts" \ + -v "$LOGS_FOLDER:/mnt/Logs" \ -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ From 75a568858d6bd4ac92ee465af91c02368a61ee37 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Mon, 29 Jan 2024 18:03:57 +0000 Subject: [PATCH 18/59] Added INT8 support --- bench_tensorrtllm/setup.sh | 86 +++++++++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 20 deletions(-) diff --git a/bench_tensorrtllm/setup.sh b/bench_tensorrtllm/setup.sh index 3d3e8547..aa4479d2 100755 --- a/bench_tensorrtllm/setup.sh +++ b/bench_tensorrtllm/setup.sh @@ -21,7 +21,6 @@ check_docker() { fi } - build_docker_image () { # Todo: might require to clone a Patched version. local repo_name="TensorRT-LLM" @@ -57,27 +56,59 @@ build_and_compile_model () { local model_build_path_32="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_32" local model_build_path_16="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_16" + local model_build_path_08="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_08" + local model_build_path_04="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_04" + if docker image inspect tensorrt_llm/release:latest &> /dev/null; then if [ ! -d "$model_build_path_32" ]; then mkdir -p "$model_build_path_32" + echo "Building model build (FP32 precision) with Docker..." + docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ + --gpus=all \ + -v "$CURRENT_DIR"/models:/models \ + -v "$model_build_path_32":/tensorrt_nvidia_build_32 \ + -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ + --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ + --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ + --workdir /code/tensorrt_llm \ + --hostname psqh4m1l0zhx-release \ + --name tensorrt_llm-release-paperspace \ + --tmpfs /tmp:exec \ + tensorrt_llm/release:latest \ + python3 ./examples/llama/build.py --model_dir /models/llama-2-7b-hf --dtype float32 --max_batch_size 1 --max_input_len 3000 --max_output_len 1024 --output_dir /tensorrt_nvidia_build_32 + else + echo "Engine file for Llama 2 build FP32 already exists. Skipping ..." fi if [ ! -d "$model_build_path_16" ]; then mkdir -p "$model_build_path_16" + echo "Building model build (FP16 precision) with Docker..." 
+ docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ + --gpus=all \ + -v "$CURRENT_DIR"/models:/models \ + -v "$model_build_path_16":/tensorrt_nvidia_build_16 \ + -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ + --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ + --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ + --workdir /code/tensorrt_llm \ + --hostname psqh4m1l0zhx-release \ + --name tensorrt_llm-release-paperspace \ + --tmpfs /tmp:exec \ + tensorrt_llm/release:latest \ + python3 ./examples/llama/build.py --model_dir /models/llama-2-7b-hf --dtype float16 --max_batch_size 1 --max_input_len 3000 --max_output_len 1024 --output_dir /tensorrt_nvidia_build_16 + else + echo "Engine file for Llama 2 build FP16 already exists. Skipping ..." fi - if [ -z "$(ls -A "$model_build_path_32")" ] || [ -z "$(ls -A "$model_build_path_16")" ]; then - echo "Building model with Docker..." + if [ ! -d "$model_build_path_08" ]; then + mkdir -p "$model_build_path_08" + echo "Generating binaries for each of the model layers" docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ --gpus=all \ - --ipc=host \ - --ulimit memlock=-1 \ - --ulimit stack=67108864 \ -v "$CURRENT_DIR"/models:/models \ - -v "$model_build_path_32":/tensorrt_nvidia_build_32 \ - -v "$model_build_path_16":/tensorrt_nvidia_build_16 \ + -v "$model_build_path_08":/tensorrt_nvidia_build_08 \ -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ @@ -86,16 +117,14 @@ build_and_compile_model () { --name tensorrt_llm-release-paperspace \ --tmpfs /tmp:exec \ tensorrt_llm/release:latest \ - python3 ./examples/llama/build.py --model_dir /models/llama-2-7b-hf --dtype float32 --max_batch_size 1 --max_input_len 3000 --max_output_len 1024 --output_dir /tensorrt_nvidia_build_32 + python3 ./examples/llama/hf_llama_convert.py -i /models/llama-2-7b-hf -o /tensorrt_nvidia_build_08 --calibrate-kv-cache -t fp16 \ + elif [ ! "$(find "$model_build_path_08" -maxdepth 1 | wc -l)" -gt 2 ]; then + echo "Building model build (FP08 precision) with Docker..." docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ --gpus=all \ - --ipc=host \ - --ulimit memlock=-1 \ - --ulimit stack=67108864 \ -v "$CURRENT_DIR"/models:/models \ - -v "$model_build_path_32":/tensorrt_nvidia_build_32 \ - -v "$model_build_path_16":/tensorrt_nvidia_build_16 \ + -v "$model_build_path_08":/tensorrt_nvidia_build_08 \ -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ @@ -104,15 +133,32 @@ build_and_compile_model () { --name tensorrt_llm-release-paperspace \ --tmpfs /tmp:exec \ tensorrt_llm/release:latest \ - python3 ./examples/llama/build.py --model_dir /models/llama-2-7b-hf --dtype float16 --max_batch_size 1 --max_input_len 3000 --max_output_len 1024 --output_dir /tensorrt_nvidia_build_16 - - echo "Model built successfully." + python3 ./examples/llama/build.py --bin_model_dir /tensorrt_nvidia_build_08/1-gpu --dtype float16 --use_gpt_attention_plugin float16 --use_gemm_plugin float16 --int8_kv_cache --output_dir /tensorrt_nvidia_build_08 --use_weight_only else - echo "Engine file already exists. Skipping build." + if [ -d "$model_build_path_08" ] && [ -d "$model_build_path_08/1-gpu" ]; then + echo "Engine file for Llama 2 build INT-8 already exists. Skipping ..." + else + echo "There is a problem with the model build directories. Please retry." 
+ fi + fi + + if [ ! -d "$model_build_path_04" ]; then + docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ + --gpus=all \ + -v "$CURRENT_DIR"/models:/models \ + -v "$model_build_path_04":/tensorrt_nvidia_build_04 \ + -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ + --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ + --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ + --workdir /code/tensorrt_llm \ + --hostname psqh4m1l0zhx-release \ + --name tensorrt_llm-release-paperspace \ + --tmpfs /tmp:exec \ + tensorrt_llm/release:latest \ + python3 ./examples/quantization/quantize.py --model_dir /models/llama-2-7b-hf --dtype float16 --qformat int4_awq --export_path /tensorrt_nvidia_build_04 --calib_size 32 fi else - echo "The base image does not exist locally. Exiting..." - exit 1 + echo "Docker image does not exist ... " fi } From 33504021e71ef6515dfcb28b2328c497da5f1ea6 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Mon, 29 Jan 2024 18:38:10 +0000 Subject: [PATCH 19/59] Add INT8 support benchmark --- bench_tensorrtllm/bench.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/bench_tensorrtllm/bench.py b/bench_tensorrtllm/bench.py index 286fb1f4..1afb4674 100644 --- a/bench_tensorrtllm/bench.py +++ b/bench_tensorrtllm/bench.py @@ -34,16 +34,16 @@ def __init__( precision: str, device: Optional[str] = "cuda", ) -> None: - assert precision in ["fp32", "fp16"], ValueError( - "Supported Precision: 'fp32' or 'fp16'" - ) + # assert precision in ["fp32", "fp16", "fp08"], ValueError( + # "Supported Precision: 'fp32' or 'fp16'" + # ) assert device == "cuda", ValueError("Supported device: 'cuda'") self.engine_dir_path = Path(model_path) engine_files = list(self.engine_dir_path.glob("*.engine")) - if len(engine_files) == 0: - raise ValueError(".engine file does not exist. Try to build the engine.") + # if len(engine_files) == 0: + # raise ValueError(".engine file does not exist. 
Try to build the engine.") self.engine_path = engine_files[0] self.config_path = self.engine_dir_path / "config.json" @@ -170,12 +170,16 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None: ) report = defaultdict(lambda: defaultdict(float)) - for precision in ("fp16", "fp32"): + for precision in ( + "fp32", + "fp16", + "fp08", + ): log_and_print( f"Running TensorRT LLM benchmark (pytorch backend) on Llama with precision: {precision}" ) llama_tensorrt_benchmark = LlamaTensorRTMBenchmark( - model_path=f"{args.models_dir}/llama-2-7b-nvidia_tensorrt_build_{precision[2:]}", + model_path=f"{args.models_dir}/llama-2-7b-nvidia_tensorrt_build_08", device=args.device, precision=precision, tokenizer_path=f"{args.models_dir}/llama-2-7b-hf", From 565f1ec523015c1d3f02eed604f1b187d9da3d33 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Mon, 29 Jan 2024 18:38:31 +0000 Subject: [PATCH 20/59] Added INT8 benchmark numbers --- docs/llama2.md.template | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/llama2.md.template b/docs/llama2.md.template index 425086db..ef2ff77c 100644 --- a/docs/llama2.md.template +++ b/docs/llama2.md.template @@ -21,12 +21,12 @@ | vllm | 90.78 ± 1.60 | 90.54 ± 2.22 | - | - | | exllamav2 | - | - | 116.91 ± 1.73 | 164.28 ± 4.07 | | ctransformers | - | - | 76.75 ± 10.36 | 84.26 ± 5.79 | -| AutoGPTQ | 42.01 ± 1.03 | 30.24 ± 0.41 | - | - | +| AutoGPTQ | 42.01 ± 1.03 | 30.24 ± 0.41 | - | - | | AutoAWQ | - | - | - | 116.94 ± 13.14| | DeepSpeed | - | 81.44 ± 8.13 | - | | | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 | | Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - | -| Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - | +| Nvidia TensorRT-LLM | 54.86 ± 0.09 | 95.61 ± 2.46 | 164.49 ± 3.10 | - | *(Data updated: ``) From 1094d1854750bbf1ae8ad6bca519ae78d5a18b4a Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 12:09:48 +0530 Subject: [PATCH 21/59] Updated Readme with latest info --- bench_burn/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bench_burn/README.md b/bench_burn/README.md index a1a45217..7bbd0340 100644 --- a/bench_burn/README.md +++ b/bench_burn/README.md @@ -2,7 +2,7 @@ [![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/Gadersd/llama2-burn)   -We use [Llama2-Burn project](https://github.com/Gadersd/llama2-burn), which provides a port of the Llama2 model to [Burn](https://github.com/tracel-ai/burn). Burn is the DeepLearning Framework for Rust, which provides similar concepts and interfaces like PyTorch. +[Burn](https://github.com/tracel-ai/burn) is a new comprehensive dynamic Deep Learning Framework built using Rust with extreme flexibility, compute efficiency and portability as its primary goals. For this benchmark implementation, we used a [forked version](https://github.com/premAI-io/llama2-burn) of the [Llama2-Burn project](https://github.com/Gadersd/llama2-burn) ### 🚀 Running the Burn Benchmark. @@ -24,10 +24,10 @@ To get started quickly you can simply run: ```bash ./bench_burn/bench.sh -d cuda ``` -This will take all the default values (see in the [bench.sh](/bench_burn/bench.sh) file) and do the benchmarks. You can find all the benchmarks results for Burn [here](/docs/llama2.md). +This will take all the default values (see in the [bench.sh](/bench_burn/bench.sh) file) and do the benchmarks. 
You can find all the benchmarks results for Burn [here](/docs/llama2.md). The HuggingFace Llama 2 weights go through a conversion process before benchmarking. See [setup.sh](/bench_burn/setup.sh) to know more.
 
 
 ### 👀 Some points to note:
 
-1. For CUDA and Metal, Burn runs for only Float32 precision.
-2. You need to download weights of LLama-2 7B from HuggingFace. This repo already does it. However it assumes that you already have accepted the [terms and condition](https://huggingface.co/meta-llama/Llama-2-7b-hf) before running or downloading the model and runnning this benchmark.
+1. Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b). So running this benchmark would assume that you already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and got verified to download the weights.
+2. For CUDA and Metal, Burn runs for only Float32 precision.

From b21b46346d9ea4a638dd7dd818f76c3d0a29c5c4 Mon Sep 17 00:00:00 2001
From: Anindyadeep
Date: Tue, 30 Jan 2024 12:02:20 +0000
Subject: [PATCH 22/59] Added latest info in Readme

---
 bench_optimum_nvidia/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bench_optimum_nvidia/README.md b/bench_optimum_nvidia/README.md
index 1f89444a..6105f6e7 100644
--- a/bench_optimum_nvidia/README.md
+++ b/bench_optimum_nvidia/README.md
@@ -28,7 +28,7 @@ This will take all the default values (see in the [bench.sh](/bench_optimum_nvid
 
 ### 👀 Some points to note:
 
-1. Optimum Nvidia although supports quantized versions, but it uses AutoGPTQ under the hood. Since we wanted to do independent benchmarking, so we skipped it for now.
+1. Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b). So running this benchmark would assume that you already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and got verified to download the weights.
 2. Optimum Nvidia uses Docker to convert the models into a specific engine format. You can find the weight conversion logic under [setup.sh](/bench_optimum_nvidia/setup.sh) file.
 3. Optimum Nvidia only supports CUDA.
-4. Optimum Nvidia requires HuggingFace Llama2-7B weights. So running this benchmark would assume that you already agree to the required terms and conditions and verified to download the weights.
+4. Current implementation readily supports Float16/32 and FP-8 precision. We do not benchmark FP-8 precision, because that it can not be compared with other frameworks. And, INT8/4 seems not to be supported.

From e9e0abe141a97cbfff2e4ccb4cf8bc25a864d2e4 Mon Sep 17 00:00:00 2001
From: Anindyadeep
Date: Tue, 30 Jan 2024 12:02:44 +0000
Subject: [PATCH 23/59] Now supporting both logging and printing the results

---
 bench_optimum_nvidia/bench.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/bench_optimum_nvidia/bench.py b/bench_optimum_nvidia/bench.py
index c2be08fd..67271696 100644
--- a/bench_optimum_nvidia/bench.py
+++ b/bench_optimum_nvidia/bench.py
@@ -17,7 +17,10 @@
     format="%(asctime)s - %(levelname)s - %(message)s",
 )
 
-# Optimum-Nvidia is meant for Nvidia GPU usage. Not any other platform is supported.
+ +def log_and_print(message: str) -> None: + print(message) + logging.info(message) class LlamaOptimumNvidiaBenchmark: @@ -69,7 +72,7 @@ def run_model(self, prompt: str, max_tokens: int) -> float: def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None: for i in range(repetitions): - print( + log_and_print( f"Running repetition [{str(i+1).zfill(len(str(repetitions)))}/{repetitions}]" ) tokens_per_second = self.run_model(prompt, max_tokens) @@ -107,14 +110,14 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None: help="Path to the models directory.", ) args = parser.parse_args() - print( + log_and_print( f"Running benchmark with: max_tokens={args.max_tokens} prompt={args.prompt} " + f"repetitions={args.repetitions} device={args.device}" ) report = defaultdict(lambda: defaultdict(float)) for precision in ("fp16", "fp32"): - print(f"Running Optimum-Nvidia on Llama with precision: {precision}") + log_and_print(f"Running Optimum-Nvidia on Llama with precision: {precision}") llama_transformers_pytorch_benchmark = LlamaOptimumNvidiaBenchmark( model_path=args.models_dir, device=args.device, @@ -128,11 +131,11 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None: "mean": np.mean(llama_transformers_pytorch_benchmark.results), "std": np.std(llama_transformers_pytorch_benchmark.results), } - print("Benchmark Report") + log_and_print("Benchmark Report") with open(args.log_file, "a") as file: for framework, quantizations in report.items(): for quantization, stats in quantizations.items(): - print( + log_and_print( f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}" ) print( From 753692c34f92829cbae3afa470321efcf46ebb03 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 12:03:06 +0000 Subject: [PATCH 24/59] Added latest info and CLI args for benchmark sh --- bench_optimum_nvidia/bench.sh | 50 ++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/bench_optimum_nvidia/bench.sh b/bench_optimum_nvidia/bench.sh index db8f056e..7fee8f60 100755 --- a/bench_optimum_nvidia/bench.sh +++ b/bench_optimum_nvidia/bench.sh @@ -2,14 +2,14 @@ ######################################################################################################## # Script: bench.sh -# Description: This script runs benchmarks Nvidia-Optimum Llama benchmark. +# Description: This script runs benchmarks Nvidia-Optimum Llama-2 benchmark. # # Usage: ./bench.sh [OPTIONS] # OPTIONS: -# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer') -# -r, --repetitions Number of repetitions for benchmarks (default: 2) -# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100) -# -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu') +# -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') +# -r, --repetitions Number of repetitions for benchmarks (default: 10) +# -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) +# -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') # -lf, --log_file Logging file name. # -md, --models_dir Models directory. 
# -h, --help Show this help message @@ -17,21 +17,21 @@ set -euo pipefail +CURRENT_DIR="$(pwd)" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" print_usage() { echo "Usage: $0 [OPTIONS]" echo "OPTIONS:" - echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')" - echo " -r, --repetitions Number of repetitions for benchmarks (default: 2)" - echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)" - echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')" + echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" + echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" + echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" + echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" echo " -lf, --log_file Logging file name." echo " -md, --models_dir Models directory." echo " -h, --help Show this help message" exit 1 } - check_cuda() { if command -v nvcc &> /dev/null then @@ -67,6 +67,18 @@ check_python() { } setup() { + + # Check if Logs folder exists else Make the logs folder + LOGS_FOLDER="$CURRENT_DIR/Logs" + + if [ -d "$LOGS_FOLDER" ]; then + echo "Folder '$LOGS_FOLDER' already exists. Skipping." + else + # Create the folder + mkdir "$LOGS_FOLDER" + echo "'$LOGS_FOLDER' created." + fi + echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." bash "$SCRIPT_DIR"/setup.sh } @@ -140,18 +152,19 @@ while [ "$#" -gt 0 ]; do ;; esac done -# Set default values if not provided -PROMPT="${PROMPT:-"Explain what is a transformer"}" -REPETITIONS="${REPETITIONS:-10}" -MAX_TOKENS="${MAX_TOKENS:-100}" -DEVICE="${DEVICE:-'cpu'}" -LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}" -MODELS_DIR="${MODELS_DIR:-"/build"}" check_platform check_python setup -# run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR" + +# Set default values if not provided +PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" +REPETITIONS="${REPETITIONS:-10}" +MAX_TOKENS="${MAX_TOKENS:-512}" +DEVICE="${DEVICE:-'cuda'}" +LOG_FILENAME="${LOG_FILENAME:-"/mnt/Logs/benchmark_optimum_nvidia_$(date +'%Y%m%d%H%M%S').log"}" +MODELS_DIR="${MODELS_DIR:-"/build"}" + docker run \ --gpus all \ @@ -161,6 +174,7 @@ docker run \ -e PYTHONUNBUFFERED=1 \ -v "$(pwd)/models:/mnt/models" \ -v "$(pwd)/models/llama-2-7b-optimum_nvidia_build:/build" \ + -v "$LOGS_FOLDER:/mnt/Logs" \ -v "$SCRIPT_DIR:/mnt/scripts" \ -it prem/optimum-nvidia:base \ python3 -u "/mnt/scripts/bench.py" \ From 75bf9eaab1922f5bdd3ab70ccc6ecbdf6a179ccc Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 12:03:55 +0000 Subject: [PATCH 25/59] Small bug fixes around image names --- bench_optimum_nvidia/setup.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bench_optimum_nvidia/setup.sh b/bench_optimum_nvidia/setup.sh index 48a05512..0cb3f704 100755 --- a/bench_optimum_nvidia/setup.sh +++ b/bench_optimum_nvidia/setup.sh @@ -26,8 +26,8 @@ build_docker_image () { local repo_name="optimum-nvidia" # Check if the Docker image exists - if docker image inspect prem/optimum-nvidia:latest &> /dev/null; then - echo "Image 'prem/optimum-nvidia:latest' already exists." + if docker image inspect prem/optimum-nvidia:base &> /dev/null; then + echo "Image 'prem/optimum-nvidia:base' already exists." 
exit 0 else @@ -47,8 +47,8 @@ build_docker_image () { build_and_compile_model () { echo "Running and building the model inside Docker..." - if docker image inspect prem/optimum-nvidia:latest &> /dev/null; then - echo "Image 'prem/optimum-nvidia:latest' already exists." + if docker image inspect prem/optimum-nvidia:base &> /dev/null; then + echo "Image 'prem/optimum-nvidia:base' already exists." exit 0 elif docker image inspect prem/optimum-nvidia:base &> /dev/null; then local model_build_path="$CURRENT_DIR/models/llama-2-7b-optimum_nvidia_build" From 10cfcd705bdda1216cb45e4a3ee24d1560e10baf Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 12:04:14 +0000 Subject: [PATCH 26/59] Added latest benchmark numbers in docs --- docs/llama2.md.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/llama2.md.template b/docs/llama2.md.template index bd99774a..067831ba 100644 --- a/docs/llama2.md.template +++ b/docs/llama2.md.template @@ -25,7 +25,7 @@ | AutoAWQ | - | - | - | 109.20 ± 3.28 | | DeepSpeed | - | 81.44 ± 8.13 | - | | | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 | -| Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - | +| Optimum Nvidia | 100.42 ± 0.03| 99.81 ± 1.76 | - | - | | Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - | *(Data updated: ``) From 59a3245994444fdef32e5af7da8be5e15db92a6c Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 12:05:52 +0000 Subject: [PATCH 27/59] Added issue reference in not supported reasons --- bench_optimum_nvidia/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_optimum_nvidia/README.md b/bench_optimum_nvidia/README.md index 6105f6e7..09eda95f 100644 --- a/bench_optimum_nvidia/README.md +++ b/bench_optimum_nvidia/README.md @@ -31,4 +31,4 @@ This will take all the default values (see in the [bench.sh](/bench_optimum_nvid 1. Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b). So running this benchmark would assume that you already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and got verified to download the weights. 2. Optimum Nvidia uses Docker to convert the models into a specific engine format. You can find the weight conversion logic under [setup.sh](/bench_optimum_nvidia/setup.sh) file. 3. Optimum Nvidia only supports CUDA. -4. Current implementation readily supports Float16/32 and FP-8 precision. We do not benchmark FP-8 precision, because that it can not be compared with other frameworks. And, INT8/4 seems not to be supported. +4. Current implementation readily supports Float16/32 and FP-8 precision. We do not benchmark FP-8 precision, because that it can not be compared with other frameworks. And, INT8/4 [does not](https://github.com/huggingface/optimum-nvidia/issues/26) seem to be supported currently. From e5ee786e1d142db117b4d6cbdb1a8cbb6f7826d7 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 12:47:23 +0000 Subject: [PATCH 28/59] Added latest info for Readme and supported devices --- bench_burn/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bench_burn/README.md b/bench_burn/README.md index 7bbd0340..d9e468f3 100644 --- a/bench_burn/README.md +++ b/bench_burn/README.md @@ -30,4 +30,5 @@ This will take all the default values (see in the [bench.sh](/bench_burn/bench.s ### 👀 Some points to note: 1. 
Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b). So running this benchmark would assume that you already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and got verified to download the weights. -2. For CUDA and Metal, Burn runs for only Float32 precision. +2. For CUDA and CPU, Burn runs for only Float32 precision. +3. The current implementation of Llama2-Burn does not support Metal. From 117be337029a97dee8148d1be1116f50882b3ad0 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 12:50:02 +0000 Subject: [PATCH 29/59] latest cli args, logs support, python3/python support --- bench_burn/bench.sh | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/bench_burn/bench.sh b/bench_burn/bench.sh index 7dc44c63..4abed91d 100755 --- a/bench_burn/bench.sh +++ b/bench_burn/bench.sh @@ -2,7 +2,7 @@ ######################################################################################################## # Script: bench.sh -# Description: This script runs benchmarks burn llama benchmark. +# Description: This script runs benchmarks Burn Llama-2 benchmark. # # Usage: ./bench.sh [OPTIONS] # OPTIONS: @@ -17,6 +17,7 @@ set -euo pipefail +CURRENT_DIR="$(pwd)" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" print_usage() { @@ -66,16 +67,28 @@ check_platform() { } check_python() { - if command -v python &> /dev/null - then - echo -e "\nUsing $(python --version)." + if command -v python &> /dev/null || command -v python3 &> /dev/null; then + echo "Python is installed." else - echo -e "\nPython does not exist." + echo "Python is not installed." exit 1 fi } + setup() { + + # Check if Logs folder exists else Make the logs folder + LOGS_FOLDER="$CURRENT_DIR/Logs" + + if [ -d "$LOGS_FOLDER" ]; then + echo "Folder '$LOGS_FOLDER' already exists. Skipping." + else + # Create the folder + mkdir "$LOGS_FOLDER" + echo "'$LOGS_FOLDER' created." + fi + echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 
bash "$SCRIPT_DIR/setup.sh" "$1" } @@ -164,16 +177,18 @@ while [ "$#" -gt 0 ]; do esac done -# Set default values if not provided -PROMPT="${PROMPT:-"Explain what is a transformer"}" -REPETITIONS="${REPETITIONS:-10}" -MAX_TOKENS="${MAX_TOKENS:-100}" -DEVICE="${DEVICE:-'cpu'}" -LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}" MODELS_DIR="${MODELS_DIR:-"./models"}" check_platform check_rust check_python setup "$MODELS_DIR" + +# Set default values if not provided +PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" +REPETITIONS="${REPETITIONS:-10}" +MAX_TOKENS="${MAX_TOKENS:-512}" +DEVICE="${DEVICE:-'cuda'}" +LOG_FILENAME="${LOG_FILENAME:-"$LOGS_FOLDER/benchmark_burn_$(date +'%Y%m%d%H%M%S').log"}" + run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR" From 3311443b8d9e4c261b064484800aa6053d90489c Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 12:50:27 +0000 Subject: [PATCH 30/59] Added python3/python aliases in setup script --- bench_burn/setup.sh | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/bench_burn/setup.sh b/bench_burn/setup.sh index de190c63..0279d7a0 100755 --- a/bench_burn/setup.sh +++ b/bench_burn/setup.sh @@ -8,6 +8,17 @@ set -euo pipefail +check_python() { + if command -v python &> /dev/null; then + PYTHON_CMD="python" + elif command -v python3 &> /dev/null; then + PYTHON_CMD="python3" + else + echo "Python is not installed." + exit 1 + fi +} + if [ "$#" -ne 1 ]; then echo "Usage: $0 " exit 1 @@ -29,7 +40,7 @@ check_and_create_directory() { } if [ ! -d "$VENV_DIR" ]; then - python -m venv "$VENV_DIR" + "$PYTHON_CMD" -m venv "$VENV_DIR" echo "Virtual environment '$VENV_DIR' created." # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" @@ -47,7 +58,7 @@ if [ ! -e "$BURN_MODEL_FOLDER/$BURN_MODEL_NAME.cfg" ]; then if [ ! -d "$BURN_MODEL_FOLDER/params" ]; then echo "Dumping model from $BURN_MODEL_INPUT_DIR to $BURN_MODEL_FOLDER" - python "$BURN_FOLDER/llama-py/dump_model.py" "$BURN_MODEL_INPUT_DIR" "$BURN_MODEL_INPUT_DIR/tokenizer.model" + "$PYTHON_CMD" "$BURN_FOLDER/llama-py/dump_model.py" "$BURN_MODEL_INPUT_DIR" "$BURN_MODEL_INPUT_DIR/tokenizer.model" mv "$(pwd)/params" "$BURN_MODEL_FOLDER" cp "$BURN_MODEL_INPUT_DIR/tokenizer.model" "$BURN_MODEL_FOLDER" else From b4ee6df207b9e7fe8a8ef93cd9cf3c494f0f35d1 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 12:52:30 +0000 Subject: [PATCH 31/59] small bug fix in setup python --- bench_burn/setup.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bench_burn/setup.sh b/bench_burn/setup.sh index 0279d7a0..ef63a72f 100755 --- a/bench_burn/setup.sh +++ b/bench_burn/setup.sh @@ -39,6 +39,8 @@ check_and_create_directory() { fi } +check_python + if [ ! -d "$VENV_DIR" ]; then "$PYTHON_CMD" -m venv "$VENV_DIR" echo "Virtual environment '$VENV_DIR' created." 
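
For reference, the weight-conversion step that the two patches above adjust can be reproduced by hand. The sketch below mirrors the `check_python` fallback and the `dump_model.py` call from `bench_burn/setup.sh`; the `RAW_WEIGHTS` and `BURN_OUT` paths are illustrative placeholders rather than the script's actual variables, and the `llama2-burn` path assumes a local clone of the Llama2-Burn repository.

```bash
# Minimal sketch, not the full setup.sh: resolve python vs python3, then dump
# the raw Llama-2 weights into Burn-compatible binaries.
if command -v python &> /dev/null; then
    PYTHON_CMD="python"
elif command -v python3 &> /dev/null; then
    PYTHON_CMD="python3"
else
    echo "Python is not installed." && exit 1
fi

RAW_WEIGHTS="./models/llama-2-7b-raw"   # assumed location of the Meta weights + tokenizer.model
BURN_OUT="./models/llama-2-7b-burn"     # assumed output folder for the Burn binaries
mkdir -p "$BURN_OUT"

# dump_model.py writes a ./params folder into the working directory;
# move it next to a copy of the tokenizer so the Burn runner can find both.
"$PYTHON_CMD" llama2-burn/llama-py/dump_model.py "$RAW_WEIGHTS" "$RAW_WEIGHTS/tokenizer.model"
mv ./params "$BURN_OUT"
cp "$RAW_WEIGHTS/tokenizer.model" "$BURN_OUT"
```
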
From 8a443c620ea8d5ade018d1f96a32ed983bc1c548 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 12:50:27 +0000 Subject: [PATCH 32/59] Added python3/python aliases in setup script --- bench_burn/setup.sh | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/bench_burn/setup.sh b/bench_burn/setup.sh index de190c63..ef63a72f 100755 --- a/bench_burn/setup.sh +++ b/bench_burn/setup.sh @@ -8,6 +8,17 @@ set -euo pipefail +check_python() { + if command -v python &> /dev/null; then + PYTHON_CMD="python" + elif command -v python3 &> /dev/null; then + PYTHON_CMD="python3" + else + echo "Python is not installed." + exit 1 + fi +} + if [ "$#" -ne 1 ]; then echo "Usage: $0 " exit 1 @@ -28,8 +39,10 @@ check_and_create_directory() { fi } +check_python + if [ ! -d "$VENV_DIR" ]; then - python -m venv "$VENV_DIR" + "$PYTHON_CMD" -m venv "$VENV_DIR" echo "Virtual environment '$VENV_DIR' created." # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" @@ -47,7 +60,7 @@ if [ ! -e "$BURN_MODEL_FOLDER/$BURN_MODEL_NAME.cfg" ]; then if [ ! -d "$BURN_MODEL_FOLDER/params" ]; then echo "Dumping model from $BURN_MODEL_INPUT_DIR to $BURN_MODEL_FOLDER" - python "$BURN_FOLDER/llama-py/dump_model.py" "$BURN_MODEL_INPUT_DIR" "$BURN_MODEL_INPUT_DIR/tokenizer.model" + "$PYTHON_CMD" "$BURN_FOLDER/llama-py/dump_model.py" "$BURN_MODEL_INPUT_DIR" "$BURN_MODEL_INPUT_DIR/tokenizer.model" mv "$(pwd)/params" "$BURN_MODEL_FOLDER" cp "$BURN_MODEL_INPUT_DIR/tokenizer.model" "$BURN_MODEL_FOLDER" else From 6a63511ffd663c8b68ab2eb5ac13f88ae1e5f0fa Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 13:43:23 +0000 Subject: [PATCH 33/59] Updated readme with latest info --- bench_burn/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bench_burn/README.md b/bench_burn/README.md index d9e468f3..e7fb6bca 100644 --- a/bench_burn/README.md +++ b/bench_burn/README.md @@ -30,5 +30,6 @@ This will take all the default values (see in the [bench.sh](/bench_burn/bench.s ### 👀 Some points to note: 1. Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b). So running this benchmark would assume that you already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and got verified to download the weights. -2. For CUDA and CPU, Burn runs for only Float32 precision. +2. For CUDA, both Float16/32 is supported, where as for CPU only Float16 precision is supported. 3. The current implementation of Llama2-Burn does not support Metal. +4. The current implementation of Llama2-Burn does not support INT-4/8 precision quantized models. 
From 5815d4891496b35b37635f62c4066e178da45836 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 13:43:55 +0000 Subject: [PATCH 34/59] Added extra echo to print the results --- bench_burn/bench.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bench_burn/bench.sh b/bench_burn/bench.sh index 4abed91d..155da136 100755 --- a/bench_burn/bench.sh +++ b/bench_burn/bench.sh @@ -123,6 +123,7 @@ run_benchmarks() { ) mean=$(echo "$benchmark_output" | grep -oP '\d+\.\d+ ± \d+\.\d+' | awk -F ' ± ' '{print $1}') std=$(echo "$benchmark_output" | grep -oP '\d+\.\d+ ± \d+\.\d+' | awk -F ' ± ' '{print $2}') + echo "burn, float16 : $(printf "%.2f" "$mean") ± $(printf "%.2f" "$std")" echo "burn, float16 : $(printf "%.2f" "$mean") ± $(printf "%.2f" "$std")" >> "$LOG_FILENAME" } # Parse command-line arguments From bd5378bf5df9b246baeba3bbef3322fe47a0cd91 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 13:44:15 +0000 Subject: [PATCH 35/59] Added latest benchmarks info on template --- docs/llama2.md.template | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/llama2.md.template b/docs/llama2.md.template index bd99774a..fab93e4e 100644 --- a/docs/llama2.md.template +++ b/docs/llama2.md.template @@ -11,7 +11,7 @@ | Engine | float32 | float16 | int8 | int4 | |------------------------------|--------------|----------------|---------------|---------------| -| burn | 13.12 ± 0.85 | - | - | - | +| burn | 10.04 ± 0.64 | - | - | - | | candle | - | 36.78 ± 2.17 | - | - | | llama.cpp | - | - | 79.15 ± 1.20 | 100.90 ± 1.46 | | ctranslate | 35.23 ± 4.01 | 55.72 ± 16.66 | 35.73 ± 10.87 | - | @@ -43,7 +43,7 @@ **Performance Metrics:** (unit: Tokens / second) | Engine | float32 | float16 | int8 | int4 | |-----------------------|--------------|--------------|--------------|--------------| -| burn | 0.30 ± 0.09 | - | - | - | +| burn | 0.21 ± 0.12 | - | - | - | | candle | - | 3.43 ± 0.02 | - | - | | llama.cpp | - | - | 13.24 ± 0.62 | 21.43 ± 0.47 | | ctranslate | - | - | 1.87 ± 0.14 | - | From 059a7c52037a260de1267e98c699314ea162ac18 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 17:20:59 +0000 Subject: [PATCH 36/59] small fix on Readme --- bench_candle/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_candle/README.md b/bench_candle/README.md index 5a7589cd..48286462 100644 --- a/bench_candle/README.md +++ b/bench_candle/README.md @@ -29,5 +29,5 @@ This will take all the default values (see in the [bench.sh](/bench_candle/bench ### 👀 Some points to note: -1. Candle does not support Float32 from the latest implementation. This implementation of Candle Llama2 does not support quantized weights of int8/4 precisions. +1. Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b). So running this benchmark would assume that you already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and got verified to download the weights. 2. Candle does not have support for Metal devices. 
From 0d02daa4c82ff6cfe52a78b171970034196f277e Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 18:54:08 +0000 Subject: [PATCH 37/59] Refactor: major script bug fixes and int4 installation support --- bench_tensorrtllm/setup.sh | 282 ++++++++++++++++++++++++------------- 1 file changed, 185 insertions(+), 97 deletions(-) diff --git a/bench_tensorrtllm/setup.sh b/bench_tensorrtllm/setup.sh index aa4479d2..70e214cd 100755 --- a/bench_tensorrtllm/setup.sh +++ b/bench_tensorrtllm/setup.sh @@ -21,6 +21,9 @@ check_docker() { fi } + +# Build the docker image + build_docker_image () { # Todo: might require to clone a Patched version. local repo_name="TensorRT-LLM" @@ -49,123 +52,208 @@ build_docker_image () { cd "$CURRENT_DIR" } -build_and_compile_model () { - set -e # Exit on error - echo "Running and building the model inside Docker..." +# build and compile different models +build_engine_float32 () { local model_build_path_32="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_32" + + if [ ! -d "$model_build_path_32" ]; then + mkdir -p "$model_build_path_32" + echo "Building the model engine file for fp32 precision ..." + docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ + --gpus all \ + -v "$CURRENT_DIR"/models:/mnt/models \ + -v "$model_build_path_32":/tensorrt_nvidia_build_32 \ + -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ + --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ + --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ + --workdir /code/tensorrt_llm \ + --hostname psqh4m1l0zhx-release \ + --name tensorrt_llm-release-prem \ + --tmpfs /tmp:exec \ + tensorrt_llm/release:latest \ + python3 examples/llama/build.py \ + --model_dir /mnt/models/llama-2-7b-hf \ + --dtype float32 \ + --max_batch_size 1 \ + --max_input_len 3000 \ + --max_output_len 1024 \ + --output_dir /tensorrt_nvidia_build_32 + else + echo "Engine file for Llama 2 fp32 precision already exists. Skipping ..." + fi +} + +build_engine_float16 () { local model_build_path_16="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_16" + + if [ ! -d "$model_build_path_16" ]; then + mkdir -p "$model_build_path_16" + echo "Building the model engine file for fp16 precision ..." + docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ + --gpus all \ + -v "$CURRENT_DIR"/models:/mnt/models \ + -v "$model_build_path_16":/tensorrt_nvidia_build_16 \ + -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ + --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ + --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ + --workdir /code/tensorrt_llm \ + --hostname psqh4m1l0zhx-release \ + --name tensorrt_llm-release-prem \ + --tmpfs /tmp:exec \ + tensorrt_llm/release:latest \ + python3 examples/llama/build.py \ + --model_dir /mnt/models/llama-2-7b-hf \ + --dtype float16 \ + --max_batch_size 1 \ + --max_input_len 3000 \ + --max_output_len 1024 \ + --output_dir /tensorrt_nvidia_build_16 + else + echo "Engine file for Llama 2 fp16 precision already exists. Skipping ..." + fi +} + +build_engine_int8 () { + local model_build_path_08="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_08" - local model_build_path_04="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_04" + if [ ! -d "$model_build_path_08" ]; then + mkdir -p "$model_build_path_08" + echo "Generating binaries for each model layers in mixed fp16-int8 precision ..." - if docker image inspect tensorrt_llm/release:latest &> /dev/null; then - if [ ! 
-d "$model_build_path_32" ]; then - mkdir -p "$model_build_path_32" - echo "Building model build (FP32 precision) with Docker..." - docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ - --gpus=all \ - -v "$CURRENT_DIR"/models:/models \ - -v "$model_build_path_32":/tensorrt_nvidia_build_32 \ - -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ - --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ - --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ - --workdir /code/tensorrt_llm \ - --hostname psqh4m1l0zhx-release \ - --name tensorrt_llm-release-paperspace \ - --tmpfs /tmp:exec \ - tensorrt_llm/release:latest \ - python3 ./examples/llama/build.py --model_dir /models/llama-2-7b-hf --dtype float32 --max_batch_size 1 --max_input_len 3000 --max_output_len 1024 --output_dir /tensorrt_nvidia_build_32 - else - echo "Engine file for Llama 2 build FP32 already exists. Skipping ..." - fi + docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ + --gpus all \ + -v "$CURRENT_DIR"/models:/mnt/models \ + -v "$model_build_path_08":/tensorrt_nvidia_build_08 \ + -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ + --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ + --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ + --workdir /code/tensorrt_llm \ + --hostname psqh4m1l0zhx-release \ + --name tensorrt_llm-release-prem \ + --tmpfs /tmp:exec \ + python3 examples/llama/hf_llama_convert.py -i /mnt/models/llama-2-7b-hf \ + -o /tensorrt_nvidia_build_08 \ + --calibrate-kv-cache -t fp16 + fi - if [ ! -d "$model_build_path_16" ]; then - mkdir -p "$model_build_path_16" - echo "Building model build (FP16 precision) with Docker..." - docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ - --gpus=all \ - -v "$CURRENT_DIR"/models:/models \ - -v "$model_build_path_16":/tensorrt_nvidia_build_16 \ - -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ - --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ - --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ - --workdir /code/tensorrt_llm \ - --hostname psqh4m1l0zhx-release \ - --name tensorrt_llm-release-paperspace \ - --tmpfs /tmp:exec \ - tensorrt_llm/release:latest \ - python3 ./examples/llama/build.py --model_dir /models/llama-2-7b-hf --dtype float16 --max_batch_size 1 --max_input_len 3000 --max_output_len 1024 --output_dir /tensorrt_nvidia_build_16 + + # now check if the folder exists but not the engine file + if [ -d "$model_build_path_08" ] && [ ! "$(find "$model_build_path_08" -maxdepth 1 | wc -l)" -gt 2 ]; then + echo "Building the model engine file for fp16-int8 mixed precision ..." + docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ + --gpus all \ + -v "$CURRENT_DIR"/models:/mnt/models \ + -v "$model_build_path_08":/tensorrt_nvidia_build_08 \ + -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ + --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ + --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ + --workdir /code/tensorrt_llm \ + --hostname psqh4m1l0zhx-release \ + --name tensorrt_llm-release-prem \ + --tmpfs /tmp:exec \ + tensorrt_llm/release:latest \ + python3 examples/llama/build.py \ + --bin_model_dir /tensorrt_nvidia_build_08/1-gpu \ + --dtype float16 \ + --use_gpt_attention_plugin float16 \ + --use_gemm_plugin float16 \ + --int8_kv_cache \ + --use_weight_only \ + --output_dir /tensorrt_nvidia_build_08 + else + if [ -d "$model_build_path_08" ] && [ -d "$model_build_path_08/1-gpu" ]; then + echo "Engine file for Llama 2 build INT-8 already exists. Skipping ..." 
else - echo "Engine file for Llama 2 build FP16 already exists. Skipping ..." + echo "There is a problem with the model build directories. Please retry." fi + fi +} + +build_engine_int4 () { + local model_build_path_04="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_04" + + if [ ! -d "$model_build_path_04" ]; then + mkdir -p "$model_build_path_04" + echo "Generating binaries for each model layers in mixed fp16-int4 precision ..." + + docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ + --gpus all \ + -v "$CURRENT_DIR"/models:/mnt/models \ + -v "$model_build_path_04":/tensorrt_nvidia_build_04 \ + -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ + --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ + --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ + --workdir /code/tensorrt_llm \ + --hostname psqh4m1l0zhx-release \ + --name tensorrt_llm-release-prem \ + --tmpfs /tmp:exec \ + tensorrt_llm/release:latest \ + python3 examples/quantization/quantize.py --model_dir /mnt/models/llama-2-7b-hf \ + --dtype float16 \ + --qformat int4_awq \ + --export_path /tensorrt_nvidia_build_04 \ + --calib_size 32 + + fi - if [ ! -d "$model_build_path_08" ]; then - mkdir -p "$model_build_path_08" - echo "Generating binaries for each of the model layers" - - docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ - --gpus=all \ - -v "$CURRENT_DIR"/models:/models \ - -v "$model_build_path_08":/tensorrt_nvidia_build_08 \ - -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ - --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ - --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ - --workdir /code/tensorrt_llm \ - --hostname psqh4m1l0zhx-release \ - --name tensorrt_llm-release-paperspace \ - --tmpfs /tmp:exec \ - tensorrt_llm/release:latest \ - python3 ./examples/llama/hf_llama_convert.py -i /models/llama-2-7b-hf -o /tensorrt_nvidia_build_08 --calibrate-kv-cache -t fp16 \ - - elif [ ! "$(find "$model_build_path_08" -maxdepth 1 | wc -l)" -gt 2 ]; then - echo "Building model build (FP08 precision) with Docker..." - docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ - --gpus=all \ - -v "$CURRENT_DIR"/models:/models \ - -v "$model_build_path_08":/tensorrt_nvidia_build_08 \ - -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ - --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ - --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ - --workdir /code/tensorrt_llm \ - --hostname psqh4m1l0zhx-release \ - --name tensorrt_llm-release-paperspace \ - --tmpfs /tmp:exec \ - tensorrt_llm/release:latest \ - python3 ./examples/llama/build.py --bin_model_dir /tensorrt_nvidia_build_08/1-gpu --dtype float16 --use_gpt_attention_plugin float16 --use_gemm_plugin float16 --int8_kv_cache --output_dir /tensorrt_nvidia_build_08 --use_weight_only + # now build the engine file + if [ -d "$model_build_path_04" ] && [ ! "$(find "$model_build_path_04" -maxdepth 1 | wc -l)" -gt 3 ]; then + echo "Building the model engine file for fp16-int4 mixed precision ..." 
+ docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ + --gpus all \ + -v "$CURRENT_DIR"/models:/mnt/models \ + -v "$model_build_path_04":/tensorrt_nvidia_build_04 \ + -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ + --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ + --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ + --workdir /code/tensorrt_llm \ + --hostname psqh4m1l0zhx-release \ + --name tensorrt_llm-release-prem \ + --tmpfs /tmp:exec \ + tensorrt_llm/release:latest \ + python3 examples/llama/build.py --model_dir /mnt/models/llama-2-7b-hf \ + --quant_ckpt_path /tensorrt_nvidia_build_04/llama_tp1_rank0.npz \ + --dtype float16 \ + --remove_input_padding \ + --use_gpt_attention_plugin float16 \ + --enable_context_fmha \ + --use_gemm_plugin float16 \ + --use_weight_only \ + --weight_only_precision int4_awq \ + --per_group \ + --output_dir /tensorrt_nvidia_build_04 + else + if [ -d "$model_build_path_04" ] && [ -d "$model_build_path_04" ]; then + echo "Engine file for Llama 2 build int4 already exists. Skipping ..." else - if [ -d "$model_build_path_08" ] && [ -d "$model_build_path_08/1-gpu" ]; then - echo "Engine file for Llama 2 build INT-8 already exists. Skipping ..." - else - echo "There is a problem with the model build directories. Please retry." - fi + echo "There is a problem with the model build directories. Please retry ..." fi + fi +} - if [ ! -d "$model_build_path_04" ]; then - docker run --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ - --gpus=all \ - -v "$CURRENT_DIR"/models:/models \ - -v "$model_build_path_04":/tensorrt_nvidia_build_04 \ - -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ - --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ - --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ - --workdir /code/tensorrt_llm \ - --hostname psqh4m1l0zhx-release \ - --name tensorrt_llm-release-paperspace \ - --tmpfs /tmp:exec \ - tensorrt_llm/release:latest \ - python3 ./examples/quantization/quantize.py --model_dir /models/llama-2-7b-hf --dtype float16 --qformat int4_awq --export_path /tensorrt_nvidia_build_04 --calib_size 32 - fi + +# Build all the engines one by one + +build_and_compile_all_engines () { + if docker image inspect tensorrt_llm/release:latest &> /dev/null; then + build_engine_float32 + build_engine_float16 + build_engine_int8 + build_engine_int4 else - echo "Docker image does not exist ... " + echo "Docker image does not exist, please build the docker image first ..." fi } +# Main entrypoint + if check_docker; then build_docker_image - build_and_compile_model + build_and_compile_all_engines else - echo "Docker is not installed or not in the PATH" + echo "Docker is not installed or not in the PATH, please make sure, docker is installed properly ..." 
exit 1 fi From e77dec721107e65abb71b1c8be26b75f2c4b9b5f Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 18:55:04 +0000 Subject: [PATCH 38/59] Added benchmarks scores for llama2 template --- docs/llama2.md.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/llama2.md.template b/docs/llama2.md.template index 46df48cc..8a20ccd0 100644 --- a/docs/llama2.md.template +++ b/docs/llama2.md.template @@ -26,7 +26,7 @@ | DeepSpeed | - | 81.44 ± 8.13 | - | | | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 | | Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - | -| Nvidia TensorRT-LLM | 54.86 ± 0.09 | 95.61 ± 2.46 | 164.49 ± 3.10 | - | +| Nvidia TensorRT-LLM | 54.86 ± 0.09 | 95.61 ± 2.46 | 164.49 ± 3.10 | 230.91 ± 10.76| *(Data updated: ``) From a0391002db1f5b6abb1b88856e501ce4a64608d2 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Tue, 30 Jan 2024 18:55:04 +0000 Subject: [PATCH 39/59] Added benchmarks scores for int4 precision in llama2 template --- docs/llama2.md.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/llama2.md.template b/docs/llama2.md.template index 46df48cc..8a20ccd0 100644 --- a/docs/llama2.md.template +++ b/docs/llama2.md.template @@ -26,7 +26,7 @@ | DeepSpeed | - | 81.44 ± 8.13 | - | | | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 | | Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - | -| Nvidia TensorRT-LLM | 54.86 ± 0.09 | 95.61 ± 2.46 | 164.49 ± 3.10 | - | +| Nvidia TensorRT-LLM | 54.86 ± 0.09 | 95.61 ± 2.46 | 164.49 ± 3.10 | 230.91 ± 10.76| *(Data updated: ``) From 904c5570c156a77cc0c87cba88d8f7a91b7e76d8 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 07:27:11 +0000 Subject: [PATCH 40/59] Added some more info in Readme regarding model precision --- bench_tensorrtllm/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/bench_tensorrtllm/README.md b/bench_tensorrtllm/README.md index c614e163..a0816b0d 100644 --- a/bench_tensorrtllm/README.md +++ b/bench_tensorrtllm/README.md @@ -30,3 +30,4 @@ This will take all the default values (see in the [bench.sh](/bench_tensorrt_llm 1. Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b). So running this benchmark would assume that you already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and got verified to download the weights. 2. TensorRT LLM only works with CUDA. So it does not support Metal/CPU. +3. For benchmarking quantized models on INT4/8 precision, TensorRT-LLM does not fully quantizes the model to INT8/4, rather it applies Mixed Precison quantization technique. So instead of INT4/8 we use Float16-INT4/8 quantized models. You can learn more about it in the [TensorRT-LLM Llama2 example](https://github.com/NVIDIA/TensorRT-LLM/blob/release/0.5.0/examples/llama/README.md). 
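
To make the Float16-INT4 point above concrete, the two-step engine build used by this benchmark (see the setup.sh changes in the earlier patch) looks roughly like the sketch below. It is assumed to run inside the `tensorrt_llm/release:latest` container with the HuggingFace weights mounted at `/mnt/models/llama-2-7b-hf`; the flags are condensed from the full docker invocation in setup.sh.

```bash
# Sketch of the fp16-int4 (AWQ, weight-only) engine build. Weights are quantized
# to INT4 while activations and matmuls stay in float16, hence "mixed precision".

# Step 1: calibrate and export an INT4-AWQ checkpoint.
python3 ./examples/quantization/quantize.py \
    --model_dir /mnt/models/llama-2-7b-hf \
    --dtype float16 \
    --qformat int4_awq \
    --export_path /tensorrt_nvidia_build_04 \
    --calib_size 32

# Step 2: build the TensorRT engine from that checkpoint.
python3 ./examples/llama/build.py \
    --model_dir /mnt/models/llama-2-7b-hf \
    --quant_ckpt_path /tensorrt_nvidia_build_04/llama_tp1_rank0.npz \
    --dtype float16 \
    --use_gpt_attention_plugin float16 \
    --use_gemm_plugin float16 \
    --use_weight_only \
    --weight_only_precision int4_awq \
    --per_group \
    --output_dir /tensorrt_nvidia_build_04
```
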
From 73c759962b60d61b426615f4026ebe3099a867eb Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 07:28:52 +0000 Subject: [PATCH 41/59] Bug fixes and added benchmark support for int4 precision --- bench_tensorrtllm/bench.py | 40 ++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/bench_tensorrtllm/bench.py b/bench_tensorrtllm/bench.py index 1afb4674..cf0d8cf3 100644 --- a/bench_tensorrtllm/bench.py +++ b/bench_tensorrtllm/bench.py @@ -13,7 +13,7 @@ from tensorrt_llm.runtime import ModelConfig, SamplingConfig from transformers import AutoTokenizer -logging.getLogger("ctranslate2").setLevel(logging.ERROR) +logging.getLogger("tensorrt_llm").setLevel(logging.ERROR) logging.basicConfig( stream=sys.stdout, level=logging.INFO, @@ -34,30 +34,31 @@ def __init__( precision: str, device: Optional[str] = "cuda", ) -> None: - # assert precision in ["fp32", "fp16", "fp08"], ValueError( - # "Supported Precision: 'fp32' or 'fp16'" - # ) - assert device == "cuda", ValueError("Supported device: 'cuda'") + assert device == "cuda", ValueError("Device other CUDA is not Supported") + assert precision in ["fp32", "fp16", "int8", "int4"], ValueError( + "Supported Precision: 'fp32', 'fp16', 'int8' and 'int4'" + ) self.engine_dir_path = Path(model_path) engine_files = list(self.engine_dir_path.glob("*.engine")) - # if len(engine_files) == 0: - # raise ValueError(".engine file does not exist. Try to build the engine.") + if len(engine_files) == 0: + raise ValueError(f"Model path: {model_path} does not consist .engine file") self.engine_path = engine_files[0] self.config_path = self.engine_dir_path / "config.json" - self.precision, self.device = precision, device + self.precision, self.device, self.tokenizer_path = ( + precision, + device, + tokenizer_path, + ) self.results = [] - self.tokenizer_path = tokenizer_path - def load_model(self): + def load_model(self) -> None: with open(self.config_path) as f: config = json.load(f) - # set the precision here - use_gpt_attention_plugin = config["plugin_config"]["gpt_attention_plugin"] remove_input_padding = config["plugin_config"]["remove_input_padding"] tp_size = config["builder_config"]["tensor_parallel"] @@ -91,14 +92,15 @@ def load_model(self): ) torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path) + with open(self.engine_path, "rb") as f: engine_buffer = f.read() self.model = tensorrt_llm.runtime.GenerationSession( model_config, engine_buffer, runtime_mapping ) + return self def run_model(self, input_ids, input_lengths, sampling_config): @@ -169,17 +171,13 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None: + f"repetitions={args.repetitions} device={args.device}" ) report = defaultdict(lambda: defaultdict(float)) - - for precision in ( - "fp32", - "fp16", - "fp08", - ): + for precision in ["fp32", "fp16", "int8", "int4"]: log_and_print( f"Running TensorRT LLM benchmark (pytorch backend) on Llama with precision: {precision}" ) + llama_tensorrt_benchmark = LlamaTensorRTMBenchmark( - model_path=f"{args.models_dir}/llama-2-7b-nvidia_tensorrt_build_08", + model_path=f"{args.models_dir}/llama-2-7b-nvidia_tensorrt_build_{precision}", device=args.device, precision=precision, tokenizer_path=f"{args.models_dir}/llama-2-7b-hf", @@ -189,7 +187,7 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None: max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions ) - 
report["llama_transformers_tensorrt"][precision] = { + report["llama_tensorrt_llm"][precision] = { "mean": np.mean(llama_tensorrt_benchmark.results), "std": np.std(llama_tensorrt_benchmark.results), } From da01884e1a9f34600252bd03725988a19d762325 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 07:29:25 +0000 Subject: [PATCH 42/59] Final bug fix and cleanups in setup and model build scripts --- bench_tensorrtllm/setup.sh | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/bench_tensorrtllm/setup.sh b/bench_tensorrtllm/setup.sh index 70e214cd..31ca18da 100755 --- a/bench_tensorrtllm/setup.sh +++ b/bench_tensorrtllm/setup.sh @@ -56,7 +56,7 @@ build_docker_image () { # build and compile different models build_engine_float32 () { - local model_build_path_32="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_32" + local model_build_path_32="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_fp32" if [ ! -d "$model_build_path_32" ]; then mkdir -p "$model_build_path_32" @@ -68,12 +68,12 @@ build_engine_float32 () { -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ - --workdir /code/tensorrt_llm \ + --workdir /app/tensorrt_llm \ --hostname psqh4m1l0zhx-release \ --name tensorrt_llm-release-prem \ --tmpfs /tmp:exec \ tensorrt_llm/release:latest \ - python3 examples/llama/build.py \ + python3 ./examples/llama/build.py \ --model_dir /mnt/models/llama-2-7b-hf \ --dtype float32 \ --max_batch_size 1 \ @@ -86,7 +86,7 @@ build_engine_float32 () { } build_engine_float16 () { - local model_build_path_16="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_16" + local model_build_path_16="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_fp16" if [ ! -d "$model_build_path_16" ]; then mkdir -p "$model_build_path_16" @@ -98,12 +98,12 @@ build_engine_float16 () { -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ - --workdir /code/tensorrt_llm \ + --workdir /app/tensorrt_llm \ --hostname psqh4m1l0zhx-release \ --name tensorrt_llm-release-prem \ --tmpfs /tmp:exec \ tensorrt_llm/release:latest \ - python3 examples/llama/build.py \ + python3 ./examples/llama/build.py \ --model_dir /mnt/models/llama-2-7b-hf \ --dtype float16 \ --max_batch_size 1 \ @@ -117,7 +117,7 @@ build_engine_float16 () { build_engine_int8 () { - local model_build_path_08="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_08" + local model_build_path_08="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_int8" if [ ! 
-d "$model_build_path_08" ]; then mkdir -p "$model_build_path_08" @@ -130,11 +130,12 @@ build_engine_int8 () { -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ - --workdir /code/tensorrt_llm \ + --workdir /app/tensorrt_llm \ --hostname psqh4m1l0zhx-release \ --name tensorrt_llm-release-prem \ --tmpfs /tmp:exec \ - python3 examples/llama/hf_llama_convert.py -i /mnt/models/llama-2-7b-hf \ + tensorrt_llm/release:latest \ + python3 ./examples/llama/hf_llama_convert.py -i /mnt/models/llama-2-7b-hf \ -o /tensorrt_nvidia_build_08 \ --calibrate-kv-cache -t fp16 fi @@ -150,12 +151,12 @@ build_engine_int8 () { -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ - --workdir /code/tensorrt_llm \ + --workdir /app/tensorrt_llm \ --hostname psqh4m1l0zhx-release \ --name tensorrt_llm-release-prem \ --tmpfs /tmp:exec \ tensorrt_llm/release:latest \ - python3 examples/llama/build.py \ + python3 ./examples/llama/build.py \ --bin_model_dir /tensorrt_nvidia_build_08/1-gpu \ --dtype float16 \ --use_gpt_attention_plugin float16 \ @@ -173,7 +174,7 @@ build_engine_int8 () { } build_engine_int4 () { - local model_build_path_04="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_04" + local model_build_path_04="$CURRENT_DIR/models/llama-2-7b-nvidia_tensorrt_build_int4" if [ ! -d "$model_build_path_04" ]; then mkdir -p "$model_build_path_04" @@ -186,12 +187,12 @@ build_engine_int4 () { -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ - --workdir /code/tensorrt_llm \ + --workdir /app/tensorrt_llm \ --hostname psqh4m1l0zhx-release \ --name tensorrt_llm-release-prem \ --tmpfs /tmp:exec \ tensorrt_llm/release:latest \ - python3 examples/quantization/quantize.py --model_dir /mnt/models/llama-2-7b-hf \ + python3 ./examples/quantization/quantize.py --model_dir /mnt/models/llama-2-7b-hf \ --dtype float16 \ --qformat int4_awq \ --export_path /tensorrt_nvidia_build_04 \ @@ -209,12 +210,12 @@ build_engine_int4 () { -v "$SCRIPT_DIR"/TensorRT-LLM:/code/tensorrt_llm \ --env "CCACHE_DIR=/code/tensorrt_llm/cpp/.ccache" \ --env "CCACHE_BASEDIR=/code/tensorrt_llm" \ - --workdir /code/tensorrt_llm \ + --workdir /app/tensorrt_llm \ --hostname psqh4m1l0zhx-release \ --name tensorrt_llm-release-prem \ --tmpfs /tmp:exec \ tensorrt_llm/release:latest \ - python3 examples/llama/build.py --model_dir /mnt/models/llama-2-7b-hf \ + python3 ./examples/llama/build.py --model_dir /mnt/models/llama-2-7b-hf \ --quant_ckpt_path /tensorrt_nvidia_build_04/llama_tp1_rank0.npz \ --dtype float16 \ --remove_input_padding \ From fde29d0f47c19fde1ef735c679071b8618e74cee Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 07:29:56 +0000 Subject: [PATCH 43/59] Added latest benchmark info of tensorrt-llm in llama2-template --- docs/llama2.md.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/llama2.md.template b/docs/llama2.md.template index 8a20ccd0..db8c5056 100644 --- a/docs/llama2.md.template +++ b/docs/llama2.md.template @@ -26,7 +26,7 @@ | DeepSpeed | - | 81.44 ± 8.13 | - | | | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 | | Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - | -| Nvidia TensorRT-LLM | 54.86 ± 0.09 | 95.61 ± 2.46 | 164.49 ± 3.10 | 230.91 ± 10.76| +| Nvidia TensorRT-LLM | 
55.19 ± 1.03 | 85.03 ± 0.62 | 167.66 ± 2.05 | 235.18 ± 3.20| *(Data updated: ``) From 41e6b71297a593e49098a8d3d7df934482387ad4 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 13:21:44 +0000 Subject: [PATCH 44/59] typo fix in readme --- bench_tinygrad/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_tinygrad/README.md b/bench_tinygrad/README.md index dbed1b26..ca9e9fde 100644 --- a/bench_tinygrad/README.md +++ b/bench_tinygrad/README.md @@ -2,7 +2,7 @@ [![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/tinygrad/tinygrad)   -TinyGrad is a minimalistic deep learning framework, very similar to [PyTorch](https://github.com/pytorch/pytorch). It's simplicity is inspired from the [micrograd](https://github.com/karpathy/micrograd) implementation by [Andrej Karpathy](https://karpathy.ai/). TinyGrad leverages uses different methods like lazy computation and kernel fusion techniques to run different operations. It supports various accelerators out of the box, including CPU, GPU etc. This benchmark implementation uses the [Llama 2 example](https://github.com/tinygrad/tinygrad/blob/master/examples/llama.py) written inside tinygrad/examples. +TinyGrad is a minimalistic deep learning framework, very similar to [PyTorch](https://github.com/pytorch/pytorch). It's simplicity is inspired from the [micrograd](https://github.com/karpathy/micrograd) implementation by [Andrej Karpathy](https://karpathy.ai/). TinyGrad leverages different methods like lazy computation and kernel fusion techniques to run different operations. It supports various accelerators out of the box, including CPU, GPU etc. This benchmark implementation uses the [Llama 2 example](https://github.com/tinygrad/tinygrad/blob/master/examples/llama.py) written inside tinygrad/examples. ### 🚀 Running the TinyGrad Benchmark. From 7e9655a9880ab8382c8cf61a291ceab949acc64d Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 13:22:22 +0000 Subject: [PATCH 45/59] Added support for python aliases, logging support and updated cli args --- bench_tinygrad/bench.sh | 56 ++++++++++++++++++++++++++--------------- bench_tinygrad/setup.sh | 15 ++++++++++- 2 files changed, 50 insertions(+), 21 deletions(-) diff --git a/bench_tinygrad/bench.sh b/bench_tinygrad/bench.sh index 4bb09d75..66d7b1bf 100755 --- a/bench_tinygrad/bench.sh +++ b/bench_tinygrad/bench.sh @@ -2,14 +2,14 @@ ######################################################################################################## # Script: bench.sh -# Description: This script runs benchmarks tinygrad llama benchmark. +# Description: This script runs benchmarks PyTorch-Transformers llama benchmark. # # Usage: ./bench.sh [OPTIONS] # OPTIONS: -# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer') -# -r, --repetitions Number of repetitions for benchmarks (default: 2) -# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100) -# -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu') +# -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') +# -r, --repetitions Number of repetitions for benchmarks (default: 10) +# -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) +# -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') # -lf, --log_file Logging file name. 
# -md, --models_dir Models directory. # -h, --help Show this help message @@ -17,15 +17,16 @@ set -euo pipefail +CURRENT_DIR="$(pwd)" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" print_usage() { echo "Usage: $0 [OPTIONS]" echo "OPTIONS:" - echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')" + echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" - echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)" - echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')" + echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" + echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" echo " -lf, --log_file Logging file name." echo " -md, --models_dir Models directory." echo " -h, --help Show this help message" @@ -57,16 +58,29 @@ check_platform() { } check_python() { - if command -v python &> /dev/null - then - echo -e "\nUsing $(python --version)." + if command -v python &> /dev/null; then + PYTHON_CMD="python" + elif command -v python3 &> /dev/null; then + PYTHON_CMD="python3" else - echo -e "\nPython does not exist." + echo "Python is not installed." exit 1 fi } setup() { + + # Check if Logs folder exists else Make the logs folder + LOGS_FOLDER="$CURRENT_DIR/Logs" + + if [ -d "$LOGS_FOLDER" ]; then + echo "Folder '$LOGS_FOLDER' already exists. Skipping." + else + # Create the folder + mkdir "$LOGS_FOLDER" + echo "'$LOGS_FOLDER' created." + fi + echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." bash "$SCRIPT_DIR"/setup.sh "$1" } @@ -86,7 +100,7 @@ run_llama_experiment() { declare -a tokens_per_second_array=() for ((i=1; i<=repetitions; i++)); do - tokens_per_second=$(python "$script_dir/tinygrad/examples/llama.py" \ + tokens_per_second=$("$PYTHON_CMD" "$script_dir/tinygrad/examples/tiny.py" \ --model "$models_dir/llama-2-7b-raw" \ --prompt "$prompt" \ --count "$max_tokens" \ @@ -179,15 +193,17 @@ while [ "$#" -gt 0 ]; do ;; esac done -# Set default values if not provided -PROMPT="${PROMPT:-"Explain what is a transformer"}" -REPETITIONS="${REPETITIONS:-10}" -MAX_TOKENS="${MAX_TOKENS:-100}" -DEVICE="${DEVICE:-'cpu'}" -LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}" -MODELS_DIR="${MODELS_DIR:-"./models"}" check_platform check_python setup "$DEVICE" + +# Set default values if not provided +PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" +REPETITIONS="${REPETITIONS:-10}" +MAX_TOKENS="${MAX_TOKENS:-512}" +DEVICE="${DEVICE:-'cuda'}" +LOG_FILENAME="${LOG_FILENAME:-"$LOGS_FOLDER/benchmark_pytorch_$(date +'%Y%m%d%H%M%S').log"}" +MODELS_DIR="${MODELS_DIR:-"./models"}" + run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR" diff --git a/bench_tinygrad/setup.sh b/bench_tinygrad/setup.sh index 5f1daa51..a8806c97 100755 --- a/bench_tinygrad/setup.sh +++ b/bench_tinygrad/setup.sh @@ -12,8 +12,21 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" VENV_DIR="$SCRIPT_DIR/venv" +check_python() { + if command -v python &> /dev/null; then + PYTHON_CMD="python" + elif command -v python3 &> /dev/null; then + PYTHON_CMD="python3" + else + echo "Python is not installed." + exit 1 + fi +} + +check_python + if [ ! 
-d "$VENV_DIR" ]; then - python -m venv "$VENV_DIR" + "$PYTHON_CMD" -m venv "$VENV_DIR" echo "Virtual environment '$VENV_DIR' created." # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" From 5ea3e92425f59d065c7215a2aa2221df356c3df8 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 18:59:03 +0530 Subject: [PATCH 46/59] revised the info in Readme and corrected precision to fp32 --- bench_burn/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_burn/README.md b/bench_burn/README.md index e7fb6bca..b1978a3b 100644 --- a/bench_burn/README.md +++ b/bench_burn/README.md @@ -30,6 +30,6 @@ This will take all the default values (see in the [bench.sh](/bench_burn/bench.s ### 👀 Some points to note: 1. Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b). So running this benchmark would assume that you already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and got verified to download the weights. -2. For CUDA, both Float16/32 is supported, where as for CPU only Float16 precision is supported. +2. The current implementation of Llama2-Burn only supports Float32 precision for CUDA and CPU. 3. The current implementation of Llama2-Burn does not support Metal. 4. The current implementation of Llama2-Burn does not support INT-4/8 precision quantized models. From 4f1402ca1ff1199fbf06c59daf21a351aa2e7596 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 18:59:22 +0530 Subject: [PATCH 47/59] Corrected precision logging to fp32 instead of fp16 --- bench_burn/bench.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bench_burn/bench.sh b/bench_burn/bench.sh index 155da136..b8a50c30 100755 --- a/bench_burn/bench.sh +++ b/bench_burn/bench.sh @@ -123,8 +123,8 @@ run_benchmarks() { ) mean=$(echo "$benchmark_output" | grep -oP '\d+\.\d+ ± \d+\.\d+' | awk -F ' ± ' '{print $1}') std=$(echo "$benchmark_output" | grep -oP '\d+\.\d+ ± \d+\.\d+' | awk -F ' ± ' '{print $2}') - echo "burn, float16 : $(printf "%.2f" "$mean") ± $(printf "%.2f" "$std")" - echo "burn, float16 : $(printf "%.2f" "$mean") ± $(printf "%.2f" "$std")" >> "$LOG_FILENAME" + echo "burn, float32 : $(printf "%.2f" "$mean") ± $(printf "%.2f" "$std")" + echo "burn, float32 : $(printf "%.2f" "$mean") ± $(printf "%.2f" "$std")" >> "$LOG_FILENAME" } # Parse command-line arguments while [ "$#" -gt 0 ]; do From 6d8a7e293d4131ecd2361d2a0b5c56301d03958f Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Wed, 31 Jan 2024 16:01:05 +0000 Subject: [PATCH 48/59] Update placeholder in llama2.md and README.md --- README.md | 4 ++-- docs/llama2.md | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f1f30473..5ef3d942 100644 --- a/README.md +++ b/README.md @@ -45,10 +45,10 @@ Take a first glance of Llama-2-7B Model Performance Metrics Across Different Pre | AutoAWQ | - | - | - | 109.20 ± 3.28 | | DeepSpeed | - | 81.44 ± 8.13 | - | | | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 | -| Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - | +| Optimum Nvidia | 100.42 ± 0.03| 99.81 ± 1.76 | - | - | | Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - | -*(Data updated: `29th January 2024`) +*(Data updated: `31th January 2024`) diff --git a/docs/llama2.md b/docs/llama2.md index 0277640a..00a941f1 100644 --- a/docs/llama2.md +++ b/docs/llama2.md @@ -25,10 +25,10 @@ | AutoAWQ | 
- | - | - | 109.20 ± 3.28 | | DeepSpeed | - | 81.44 ± 8.13 | - | | | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 | -| Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - | +| Optimum Nvidia | 100.42 ± 0.03| 99.81 ± 1.76 | - | - | | Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - | -*(Data updated: `29th January 2024`) +*(Data updated: `31th January 2024`) ## M2 MAX 32GB Inference Bench: @@ -72,4 +72,4 @@ | exllamav2 | - | - | - | - | | vllm | - | - | - | - | -*(Data updated: `29th January 2024`) +*(Data updated: `31th January 2024`) From cd8f4ca5405420371fd6e46f4f8447fa520d3e4d Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 22:26:52 +0530 Subject: [PATCH 49/59] Update bench_tinygrad/bench.sh Co-authored-by: Nicola Sosio --- bench_tinygrad/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_tinygrad/bench.sh b/bench_tinygrad/bench.sh index 66d7b1bf..17f2309f 100755 --- a/bench_tinygrad/bench.sh +++ b/bench_tinygrad/bench.sh @@ -2,7 +2,7 @@ ######################################################################################################## # Script: bench.sh -# Description: This script runs benchmarks PyTorch-Transformers llama benchmark. +# Description: This script runs benchmarks TinyGrad llama benchmark. # # Usage: ./bench.sh [OPTIONS] # OPTIONS: From 03e78e15e7da84b4420cdd7fb0fb20f255944c07 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 17:08:39 +0000 Subject: [PATCH 50/59] Added latest reproducibility status of the benchmark --- bench_tinygrad/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/bench_tinygrad/README.md b/bench_tinygrad/README.md index ca9e9fde..bbc1d5ee 100644 --- a/bench_tinygrad/README.md +++ b/bench_tinygrad/README.md @@ -31,3 +31,4 @@ This will take all the default values (see in the [bench.sh](/bench_tinygrad/ben 1. The current implementation of TinyGrad only supports Float16 for CUDA, CPU and Metal. 2. This benchmark implementation expects the Raw Llama 2 weights from Meta AI to run LLama2 Model. So it assumes that you already accepted all the [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) before running it. +3. Please note, the current implementation won't work if tried to reproduce. There are certain conflicts with the main tinygrad repo. This will be fixed in the upcoming versions. From f51376aeae2c8f651768e1cb3ac6c7ba88c4a73a Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 22:40:42 +0530 Subject: [PATCH 51/59] Update bench_candle/README.md Co-authored-by: Nicola Sosio --- bench_candle/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_candle/README.md b/bench_candle/README.md index 48286462..e6218a9e 100644 --- a/bench_candle/README.md +++ b/bench_candle/README.md @@ -2,7 +2,7 @@ [![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/huggingface/candle)   -[Candle](https://github.com/huggingface/candle) is a minimalistic Machine /Deep Learning framework written on Rust by [huggingface](https://github.com/huggingface). It tries to provide a simpler interface to implement models along with GPU support. This is a modified implementation of [Llama2-Candle example](https://github.com/huggingface/candle/blob/main/candle-examples/examples/llama/main.rs) to analyse the benchmark performance across different devices and precision. 
+[Candle](https://github.com/huggingface/candle) is a minimalistic Machine/Deep Learning framework written on Rust by [huggingface](https://github.com/huggingface). It tries to provide a simpler interface to implement models along with GPU support. This is a modified implementation of [Llama2-Candle example](https://github.com/huggingface/candle/blob/main/candle-examples/examples/llama/main.rs) to analyse the benchmark performance across different devices and precision. ### 🚀 Running the Candle Benchmark. From 3cbac9880fdfd1edc08b412c0962269d63299758 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Wed, 31 Jan 2024 17:11:22 +0000 Subject: [PATCH 52/59] Update placeholder in llama2.md and README.md --- README.md | 4 ++-- docs/llama2.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5ef3d942..924becfe 100644 --- a/README.md +++ b/README.md @@ -45,8 +45,8 @@ Take a first glance of Llama-2-7B Model Performance Metrics Across Different Pre | AutoAWQ | - | - | - | 109.20 ± 3.28 | | DeepSpeed | - | 81.44 ± 8.13 | - | | | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 | -| Optimum Nvidia | 100.42 ± 0.03| 99.81 ± 1.76 | - | - | -| Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - | +| Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - | +| Nvidia TensorRT-LLM | 55.19 ± 1.03 | 85.03 ± 0.62 | 167.66 ± 2.05 | 235.18 ± 3.20 | *(Data updated: `31th January 2024`) diff --git a/docs/llama2.md b/docs/llama2.md index 00a941f1..759072da 100644 --- a/docs/llama2.md +++ b/docs/llama2.md @@ -25,8 +25,8 @@ | AutoAWQ | - | - | - | 109.20 ± 3.28 | | DeepSpeed | - | 81.44 ± 8.13 | - | | | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 | -| Optimum Nvidia | 100.42 ± 0.03| 99.81 ± 1.76 | - | - | -| Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - | +| Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - | +| Nvidia TensorRT-LLM | 55.19 ± 1.03 | 85.03 ± 0.62 | 167.66 ± 2.05 | 235.18 ± 3.20 | *(Data updated: `31th January 2024`) From 358f63e694d460e9c1bea36e5a5b079d0900a2db Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 23:07:54 +0530 Subject: [PATCH 53/59] Added setup procedure for burn --- bench_burn/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_burn/README.md b/bench_burn/README.md index b1978a3b..fbb44e71 100644 --- a/bench_burn/README.md +++ b/bench_burn/README.md @@ -29,7 +29,7 @@ This will take all the default values (see in the [bench.sh](/bench_burn/bench.s ### 👀 Some points to note: -1. Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b). So running this benchmark would assume that you already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and got verified to download the weights. +1. Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b). So running this benchmark would assume that you already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and got verified to download the weights. This weights are dumped and converted to binaries compatible with Burn. You can find the conversion process in [setup.sh](/bench_burn/setup.sh). 2. The current implementation of Llama2-Burn only supports Float32 precision for CUDA and CPU. 3. 
The current implementation of Llama2-Burn does not support Metal. 4. The current implementation of Llama2-Burn does not support INT-4/8 precision quantized models. From 33818f12af166a9c3c7c43a7fe58895820098cf7 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Wed, 31 Jan 2024 17:42:44 +0000 Subject: [PATCH 54/59] Update placeholder in llama2.md and README.md --- README.md | 2 +- docs/llama2.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 924becfe..510fffd0 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Take a first glance of Llama-2-7B Model Performance Metrics Across Different Pre | Engine | float32 | float16 | int8 | int4 | |------------------------------|--------------|----------------|---------------|---------------| -| burn | 13.12 ± 0.85 | - | - | - | +| burn | 10.04 ± 0.64 | - | - | - | | candle | - | 36.78 ± 2.17 | - | - | | llama.cpp | - | - | 79.15 ± 1.20 | 100.90 ± 1.46 | | ctranslate | 35.23 ± 4.01 | 55.72 ± 16.66 | 35.73 ± 10.87 | - | diff --git a/docs/llama2.md b/docs/llama2.md index 759072da..19067d94 100644 --- a/docs/llama2.md +++ b/docs/llama2.md @@ -11,7 +11,7 @@ | Engine | float32 | float16 | int8 | int4 | |------------------------------|--------------|----------------|---------------|---------------| -| burn | 13.12 ± 0.85 | - | - | - | +| burn | 10.04 ± 0.64 | - | - | - | | candle | - | 36.78 ± 2.17 | - | - | | llama.cpp | - | - | 79.15 ± 1.20 | 100.90 ± 1.46 | | ctranslate | 35.23 ± 4.01 | 55.72 ± 16.66 | 35.73 ± 10.87 | - | @@ -43,7 +43,7 @@ **Performance Metrics:** (unit: Tokens / second) | Engine | float32 | float16 | int8 | int4 | |-----------------------|--------------|--------------|--------------|--------------| -| burn | 0.30 ± 0.09 | - | - | - | +| burn | 0.21 ± 0.12 | - | - | - | | candle | - | 3.43 ± 0.02 | - | - | | llama.cpp | - | - | 13.24 ± 0.62 | 21.43 ± 0.47 | | ctranslate | - | - | 1.87 ± 0.14 | - | From d5d8572c801fe43c1a0601224a36924be9538c38 Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 23:13:56 +0530 Subject: [PATCH 55/59] Added quantized info for candle --- bench_candle/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/bench_candle/README.md b/bench_candle/README.md index e6218a9e..a30206ff 100644 --- a/bench_candle/README.md +++ b/bench_candle/README.md @@ -31,3 +31,4 @@ This will take all the default values (see in the [bench.sh](/bench_candle/bench 1. Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b). So running this benchmark would assume that you already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and got verified to download the weights. 2. Candle does not have support for Metal devices. +3. Candles does support [quantized models](https://github.com/huggingface/candle/blob/main/candle-examples/examples/quantized/main.rs). The benchmarks for quantized candles model will be available in the next versions. 
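
Until that support lands, the quantized path can be tried directly from a candle checkout. The sketch below is an assumption based on the example linked above (candle-examples/examples/quantized); the exact CLI flags may differ across candle versions, and this invocation is not part of this repo's bench scripts.

```bash
# Sketch only: run candle's GGML-style quantized Llama example from a local clone.
# The --prompt flag is assumed from the example's CLI; check --help for the
# options available in your candle version.
git clone https://github.com/huggingface/candle.git
cd candle
cargo run --release --example quantized -- --prompt "Write an essay about the transformer model architecture"
```
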
From 6c2de481d874aac4664d77316fa31001a29fbccb Mon Sep 17 00:00:00 2001
From: Anindyadeep
Date: Wed, 31 Jan 2024 23:34:20 +0530
Subject: [PATCH 56/59] Added benchmark reproducibility info in readme

---
 bench_onnxruntime/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bench_onnxruntime/README.md b/bench_onnxruntime/README.md
index 3755ae46..f7a35543 100644
--- a/bench_onnxruntime/README.md
+++ b/bench_onnxruntime/README.md
@@ -32,3 +32,4 @@ This will take all the default values (see in the [bench.sh](/bench_onnxruntime/
 1. ONNX Runtime requires HuggingFace Llama2-7B weights. And it converts those weights into ONNX format using this [setup.sh](/bench_onnxruntime/setup.sh) script. So running this benchmark would assume that you already agree to the required terms and conditions and verified to download the weights.
 2. ONNX Runtime GPU only support Float16 precision format.
 3. Running LLama 2 using ONNX Runtime in CPU/Metal is too memory intensive, so benchmarking is skipped for those.
+4. Please note that, you might not be able to fix the current implementation of onnx benchmark. Since it requires some specific depedencies and anaconda support. Fix will be reflected in the upcoming versions.

From 1dbd21f46b13acd8fc36a181b740a7604dd7a0d2 Mon Sep 17 00:00:00 2001
From: Anindyadeep
Date: Wed, 31 Jan 2024 23:34:51 +0530
Subject: [PATCH 57/59] Added logs folder support, python aliases support and latest cli args

---
 bench_onnxruntime/bench.sh | 58 ++++++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 21 deletions(-)

diff --git a/bench_onnxruntime/bench.sh b/bench_onnxruntime/bench.sh
index c90ca7a7..5b3a2894 100755
--- a/bench_onnxruntime/bench.sh
+++ b/bench_onnxruntime/bench.sh
@@ -2,14 +2,14 @@
 ########################################################################################################
 # Script: bench.sh
-# Description: This script runs benchmarks onnxruntime llama benchmark.
+# Description: This script runs the ONNX Runtime Llama-2 benchmark.
 #
 # Usage: ./bench.sh [OPTIONS]
 # OPTIONS:
-#   -p, --prompt      Prompt for benchmarks (default: 'Explain what is a transformer')
-#   -r, --repetitions Number of repetitions for benchmarks (default: 2)
-#   -m, --max_tokens  Maximum number of tokens for benchmarks (default: 100)
-#   -d, --device      Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')
+#   -p, --prompt      Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')
+#   -r, --repetitions Number of repetitions for benchmarks (default: 10)
+#   -m, --max_tokens  Maximum number of tokens for benchmarks (default: 512)
+#   -d, --device      Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')
 #   -lf, --log_file   Logging file name.
 #   -md, --models_dir Models directory.
 #   -h, --help        Show this help message
@@ -17,21 +17,21 @@
 set -euo pipefail
 
+CURRENT_DIR="$(pwd)"
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 print_usage() {
     echo "Usage: $0 [OPTIONS]"
     echo "OPTIONS:"
-    echo "  -p, --prompt      Prompt for benchmarks (default: 'Explain what is a transformer')"
-    echo "  -r, --repetitions Number of repetitions for benchmarks (default: 2)"
-    echo "  -m, --max_tokens  Maximum number of tokens for benchmarks (default: 100)"
-    echo "  -d, --device      Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')"
+    echo "  -p, --prompt      Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')"
+    echo "  -r, --repetitions Number of repetitions for benchmarks (default: 10)"
+    echo "  -m, --max_tokens  Maximum number of tokens for benchmarks (default: 512)"
+    echo "  -d, --device      Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')"
     echo "  -lf, --log_file   Logging file name."
     echo "  -md, --models_dir Models directory."
     echo "  -h, --help        Show this help message"
     exit 1
 }
-
 check_cuda() {
     if command -v nvcc &> /dev/null
     then
@@ -57,16 +57,29 @@ check_platform() {
 }
 
 check_python() {
-    if command -v python &> /dev/null
-    then
-        echo -e "\nUsing $(python --version)."
+    if command -v python &> /dev/null; then
+        PYTHON_CMD="python"
+    elif command -v python3 &> /dev/null; then
+        PYTHON_CMD="python3"
     else
-        echo -e "\nPython does not exist."
+        echo "Python is not installed."
         exit 1
     fi
 }
 
 setup() {
+
+    # Check if the logs folder exists, else create it
+    LOGS_FOLDER="$CURRENT_DIR/Logs"
+
+    if [ -d "$LOGS_FOLDER" ]; then
+        echo "Folder '$LOGS_FOLDER' already exists. Skipping."
+    else
+        # Create the folder
+        mkdir "$LOGS_FOLDER"
+        echo "'$LOGS_FOLDER' created."
+    fi
+
     echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
bash "$SCRIPT_DIR"/setup.sh "$1" } @@ -82,7 +95,7 @@ run_benchmarks() { # shellcheck disable=SC1091 source "$SCRIPT_DIR/venv/bin/activate" - python "$SCRIPT_DIR"/bench.py \ + "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ --prompt "$PROMPT" \ --repetitions "$REPETITIONS" \ --max_tokens "$MAX_TOKENS" \ @@ -147,15 +160,18 @@ while [ "$#" -gt 0 ]; do ;; esac done -# Set default values if not provided -PROMPT="${PROMPT:-"Explain what is a transformer"}" -REPETITIONS="${REPETITIONS:-10}" -MAX_TOKENS="${MAX_TOKENS:-100}" -DEVICE="${DEVICE:-'cpu'}" -LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}" + MODELS_DIR="${MODELS_DIR:-"./models"}" check_platform check_python setup "$MODELS_DIR" + +# Set default values if not provided +PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" +REPETITIONS="${REPETITIONS:-10}" +MAX_TOKENS="${MAX_TOKENS:-512}" +DEVICE="${DEVICE:-'cuda'}" +LOG_FILENAME="${LOG_FILENAME:-"$LOGS_FOLDER/benchmark_onnx_$(date +'%Y%m%d%H%M%S').log"}" + run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR" From 4c9200c4c5ce772cb2e2fe013edd5c461348d9fe Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 23:35:15 +0530 Subject: [PATCH 58/59] python aliases support in setup --- bench_onnxruntime/setup.sh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/bench_onnxruntime/setup.sh b/bench_onnxruntime/setup.sh index d33a443b..2415fbd6 100755 --- a/bench_onnxruntime/setup.sh +++ b/bench_onnxruntime/setup.sh @@ -13,6 +13,17 @@ if [ "$#" -ne 1 ]; then exit 1 fi +check_python() { + if command -v python &> /dev/null; then + PYTHON_CMD="python" + elif command -v python3 &> /dev/null; then + PYTHON_CMD="python3" + else + echo "Python is not installed." + exit 1 + fi +} + # Define directory paths SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" VENV_DIR="$SCRIPT_DIR/venv" @@ -20,8 +31,10 @@ MODELS_FOLDER="$1" LLAMA_HF_MODEL_DIR="$MODELS_FOLDER/llama-2-7b-hf" LLAMA_ONNX_MODEL_DIR="$MODELS_FOLDER/llama-2-7b-onnx" +check_python + if [ ! -d "$VENV_DIR" ]; then - python -m venv "$VENV_DIR" + "$PYTHON_CMD" -m venv "$VENV_DIR" echo "Virtual environment '$VENV_DIR' created." # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" From f83f6c82f61c4fc737add292cf34a70ea03abe1e Mon Sep 17 00:00:00 2001 From: Anindyadeep Date: Wed, 31 Jan 2024 23:40:51 +0530 Subject: [PATCH 59/59] Update bench_onnxruntime/README.md Co-authored-by: Nicola Sosio --- bench_onnxruntime/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench_onnxruntime/README.md b/bench_onnxruntime/README.md index f7a35543..af9c4b9b 100644 --- a/bench_onnxruntime/README.md +++ b/bench_onnxruntime/README.md @@ -32,4 +32,4 @@ This will take all the default values (see in the [bench.sh](/bench_onnxruntime/ 1. ONNX Runtime requires HuggingFace Llama2-7B weights. And it converts those weights into ONNX format using this [setup.sh](/bench_onnxruntime/setup.sh) script. So running this benchmark would assume that you already agree to the required terms and conditions and verified to download the weights. 2. ONNX Runtime GPU only support Float16 precision format. 3. Running LLama 2 using ONNX Runtime in CPU/Metal is too memory intensive, so benchmarking is skipped for those. -4. Please note that, you might not be able to fix the current implementation of onnx benchmark. Since it requires some specific depedencies and anaconda support. Fix will be reflected in the upcoming versions. +4. 
+4. Please note that you might not be able to run the current implementation of the ONNX benchmark, since it requires some specific dependencies and Anaconda support. A fix will be reflected in the upcoming versions.
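Until that fix lands, one possible way to produce the ONNX weights by hand is Hugging Face Optimum's exporter. The sketch below makes two assumptions: it is not necessarily the same conversion path as the repo's [setup.sh](/bench_onnxruntime/setup.sh), and it presumes you have already been granted access to `meta-llama/Llama-2-7b-hf` and are logged in to the Hugging Face Hub.

```bash
# Sketch: export Llama-2-7B to ONNX with Optimum instead of the repo's own setup script.
# Package versions and the exact task name may need adjusting for your environment.
pip install "optimum[onnxruntime-gpu]" transformers accelerate
huggingface-cli login   # requires prior approval for the gated meta-llama/Llama-2-7b-hf repo

# Writes the exported graph and weights where bench.sh looks for them by default.
optimum-cli export onnx \
    --model meta-llama/Llama-2-7b-hf \
    --task text-generation-with-past \
    --fp16 --device cuda \
    ./models/llama-2-7b-onnx
```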