diff --git a/README.md b/README.md
index ac60881a..f1f30473 100644
--- a/README.md
+++ b/README.md
@@ -38,12 +38,12 @@ Take a first glance of Llama-2-7B Model Performance Metrics Across Different Pre
 | tinygrad | - | 20.32 ± 0.06 | - | - |
 | onnx | - | 54.16 ± 3.15 | - | - |
 | transformers (pytorch) | 43.79 ± 0.61 | 46.39 ± 0.28 | 6.98 ± 0.05 | 21.72 ± 0.11 |
-| vllm | 91.23 ± 3.61 | 91.44 ± 3.83 | - | 113.38 ± 11.70|
-| exllamav2 | - | - | 116.91 ± 1.73 | 164.28 ± 4.07 |
+| vllm | 90.78 ± 1.60 | 90.54 ± 2.22 | - | - |
+| exllamav2 | - | - | 121.63 ± 0.74 | 130.16 ± 0.35 |
 | ctransformers | - | - | 76.75 ± 10.36 | 84.26 ± 5.79 |
 | AutoGPTQ | 42.01 ± 1.03 | 30.24 ± 0.41 | - | - |
-| AutoAWQ | - | - | - | 116.94 ± 13.14|
-| DeepSpeed | - | 83.54 ± 5.25 | - | |
+| AutoAWQ | - | - | - | 109.20 ± 3.28 |
+| DeepSpeed | - | 81.44 ± 8.13 | - | |
 | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 |
 | Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - |
 | Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - |
diff --git a/bench_autoawq/README.md b/bench_autoawq/README.md
new file mode 100644
index 00000000..1f556ed2
--- /dev/null
+++ b/bench_autoawq/README.md
@@ -0,0 +1,36 @@
+# AutoAWQ
+
+[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/casper-hansen/AutoAWQ)&nbsp;
+[![ArXiv](https://img.shields.io/badge/arXiv-%230170FE.svg?style=for-the-badge&logo=arxiv&logoColor=white)](https://arxiv.org/abs/2306.00978)
+
+
+[AutoAWQ](https://github.com/casper-hansen/AutoAWQ) is a package that provides a polished implementation of the original [llm-awq](https://github.com/mit-han-lab/llm-awq) work from MIT. AWQ (Activation-aware Weight Quantization) is a quantization method that supports 4-bit quantization. It substantially increases inference throughput while decreasing the memory requirement of the model. (For example, according to this [reference](https://huggingface.co/TheBloke/Llama-2-70B-Chat-AWQ), Llama 2 70B normally requires 2 x 80 GB GPUs, but with AWQ it can run on a single 48 GB GPU.) You can learn more about AWQ in the research paper and the GitHub implementations linked above.
+
+### 🚀 Running the AutoAWQ Benchmark.
+
+You can run the AutoAWQ benchmark using the following command:
+
+```bash
+./bench_autoawq/bench.sh \
+ --prompt \ # Enter a prompt string
+ --max_tokens \ # Maximum number of tokens to output
+ --repetitions \ # Number of repetitions to be made for the prompt.
+ --log_file \ # A .log file under which we want to write the results.
+ --device \ # The device on which we want to run the benchmark.
+ --models_dir # The directory in which AWQ model weights are present
+```
+
+To get started quickly you can simply run:
+
+```bash
+./bench_autoawq/bench.sh -d cuda
+```
+This will take all the default values (see the [bench.sh](/bench_autoawq/bench.sh) file) and run the benchmarks. You can find all the benchmark results for AutoAWQ [here](/docs/llama2.md).
+
+
+### 👀 Some points to note:
+
+1. AutoAWQ does not support devices other than GPUs (it runs only when CUDA is available).
+2. We benchmark AutoAWQ on its own (i.e. the actual AWQ quantization method). We do not benchmark combinations like AutoAWQ + vLLM or AutoAWQ + TensorRT.
+3. The default model chosen for this benchmark is [Llama2-AutoAWQ by TheBloke](https://huggingface.co/TheBloke/Llama-2-7B-AWQ).
+4. AutoAWQ does not support INT8 quantization properly yet. See [this issue](https://github.com/casper-hansen/AutoAWQ/issues/45).
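+
+### 🧩 A minimal usage sketch
+
+For readers who want to see what the benchmark wraps, here is a minimal, hedged sketch of loading and generating with a 4-bit AWQ model through AutoAWQ's `AutoAWQForCausalLM` API. The model path, prompt, and generation settings below are illustrative assumptions, not the exact configuration used by [bench.py](/bench_autoawq/bench.py):
+
+```python
+from awq import AutoAWQForCausalLM
+from transformers import AutoTokenizer
+
+# Assumed local path to 4-bit AWQ weights; point this at your --models_dir.
+model_path = "./models/llama-2-7b-autoawq"
+
+# fuse_layers enables AutoAWQ's fused kernels for faster decoding.
+model = AutoAWQForCausalLM.from_quantized(model_path, fuse_layers=True)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+inputs = tokenizer("Write an essay about the transformer model architecture", return_tensors="pt").input_ids.cuda()
+output = model.generate(inputs, max_new_tokens=512)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```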
diff --git a/bench_autoawq/bench.py b/bench_autoawq/bench.py
index d57d5b4b..17fb6a66 100644
--- a/bench_autoawq/bench.py
+++ b/bench_autoawq/bench.py
@@ -19,10 +19,8 @@ class LlamaAutoAWQBenchmark:
     def __init__(self, model_path: str, precision: int, device: str) -> None:
-        assert precision in ["fp16"], "For benchmarks supported precision is in FP16."
-        assert (
-            device == "cuda"
-        ), "Since it's an optimization for FP-16, CPU not supported."
+        assert device == "cuda", "Device other than CUDA is not supported for autoawq."
+        assert precision == "int4", "Precision other than INT4 is not supported."
 
         self.model_path, self.precision, self.device = (
             model_path,
@@ -99,11 +97,10 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
     )
     report = defaultdict(lambda: defaultdict(float))
 
-    # Hardcoding precision to fp16 for AutoAWQ
-    precision = 16
+    precision = 4
 
     if args.device == "cpu":
-        logging.info("Skipping running model on fp16 on CPU, not implemented for Half")
+        logging.info("Skipping running model on int4 on CPU; AutoAWQ requires CUDA.")
         pass
     else:
         logging.info(
@@ -112,7 +109,7 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
         llama_autogptq_benchmark = LlamaAutoAWQBenchmark(
             model_path=f"{args.models_dir}/llama-2-7b-autoawq",
             device=args.device,
-            precision=f"fp{precision}",
+            precision=f"int{precision}",
         ).load_model()
         llama_autogptq_benchmark.benchmark(
             max_tokens=args.max_tokens,
@@ -120,7 +117,7 @@ def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
             repetitions=args.repetitions,
         )
 
-        report["Llama AutoAWQ"][f"FP-{precision}"] = {
+        report["Llama AutoAWQ"][f"INT-{precision}"] = {
             "mean": np.mean(llama_autogptq_benchmark.results),
             "std": np.std(llama_autogptq_benchmark.results),
         }
diff --git a/bench_autoawq/bench.sh b/bench_autoawq/bench.sh
index 3c977e61..41c3c11f 100755
--- a/bench_autoawq/bench.sh
+++ b/bench_autoawq/bench.sh
@@ -6,10 +6,10 @@
 #
 # Usage: ./bench.sh [OPTIONS]
 # OPTIONS:
-# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')
-# -r, --repetitions Number of repetitions for benchmarks (default: 2)
-# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)
-# -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')
+# -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')
+# -r, --repetitions Number of repetitions for benchmarks (default: 10)
+# -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)
+# -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')
 # -lf, --log_file Logging file name.
 # -md, --models_dir Models directory.
 # -h, --help Show this help message
@@ -17,15 +17,16 @@
 
 set -euo pipefail
 
+CURRENT_DIR="$(pwd)"
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 print_usage() {
     echo "Usage: $0 [OPTIONS]"
     echo "OPTIONS:"
-    echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')"
-    echo " -r, --repetitions Number of repetitions for benchmarks (default: 2)"
-    echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)"
-    echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')"
+    echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')"
+    echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)"
+    echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)"
+    echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')"
     echo " -lf, --log_file Logging file name."
     echo " -md, --models_dir Models directory."
     echo " -h, --help Show this help message"
@@ -69,6 +70,18 @@ check_python() {
 
 setup() {
+
+    # Check if Logs folder exists else Make the logs folder
+    LOGS_FOLDER="$CURRENT_DIR/Logs"
+
+    if [ -d "$LOGS_FOLDER" ]; then
+        echo "Folder '$LOGS_FOLDER' already exists. Skipping."
+    else
+        # Create the folder
+        mkdir "$LOGS_FOLDER"
+        echo "'$LOGS_FOLDER' created."
+    fi
+
     echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
     bash "$SCRIPT_DIR"/setup.sh
 }
@@ -142,15 +155,16 @@ while [ "$#" -gt 0 ]; do
     esac
 done
 
+check_platform
+check_python
+setup
+
 # Set default values if not provided
-PROMPT="${PROMPT:-"Explain what is a transformer"}"
+PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}"
 REPETITIONS="${REPETITIONS:-10}"
-MAX_TOKENS="${MAX_TOKENS:-100}"
-DEVICE="${DEVICE:-'cpu'}"
-LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}"
+MAX_TOKENS="${MAX_TOKENS:-512}"
+DEVICE="${DEVICE:-'cuda'}"
+LOG_FILENAME="${LOG_FILENAME:-"$LOGS_FOLDER/benchmark_autoawq_$(date +'%Y%m%d%H%M%S').log"}"
 MODELS_DIR="${MODELS_DIR:-"./models"}"
 
-check_platform
-check_python
-setup
 run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR"
diff --git a/bench_exllamav2/README.md b/bench_exllamav2/README.md
new file mode 100644
index 00000000..9c912591
--- /dev/null
+++ b/bench_exllamav2/README.md
@@ -0,0 +1,34 @@
+# ExLlamaV2
+
+[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/turboderp/exllamav2)&nbsp;
+
+[ExLlamaV2](https://github.com/turboderp/exllamav2) uses custom kernels to speed up LLM inference under different quantizations. ExLlamaV2 supports the new "EXL2" format, which is based on the same optimization method as GPTQ and supports 2, 3, 4, 5, 6 and 8-bit quantization. For this benchmark implementation, we use the 4-bit and 8-bit quantized versions of Llama 2.
+
+
+### 🚀 Running the ExLlamaV2 Benchmark.
+
+You can run the ExLlamaV2 benchmark using the following command:
+
+```bash
+./bench_exllamav2/bench.sh \
+ --prompt \ # Enter a prompt string
+ --max_tokens \ # Maximum number of tokens to output
+ --repetitions \ # Number of repetitions to be made for the prompt.
+ --log_file \ # A .log file under which we want to write the results.
+ --device \ # The device on which we want to run the benchmark.
+ --models_dir # The directory in which model weights are present
+```
+
+To get started quickly you can simply run:
+
+```bash
+./bench_exllamav2/bench.sh -d cuda
+```
+This will take all the default values (see the [bench.sh](/bench_exllamav2/bench.sh) file) and perform the benchmarks. You can find all the benchmark results for ExLlamaV2 [here](/docs/llama2.md).
+
+
+### 👀 Some points to note:
+
+1. ExLlamaV2 only supports quantized LLMs, so Float32/16 precision is not supported here.
+2. ExLlamaV2 currently [does not have support](https://github.com/turboderp/exllamav2/issues/184) for Mac/Metal.
+3. Although ExLlamaV2 supports offloading to CPU, it is too slow to be practical, so we did not include it in our benchmarks.
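+
+### 🧩 A minimal usage sketch
+
+The loading path used by [bench.py](/bench_exllamav2/bench.py) boils down to the following sketch. The model directory and sampling values here are illustrative; see the benchmark source for the exact settings:
+
+```python
+from exllamav2 import ExLlamaV2, ExLlamaV2Cache
+from exllamav2.config import ExLlamaV2Config
+from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
+from exllamav2.tokenizer import ExLlamaV2Tokenizer
+
+config = ExLlamaV2Config()
+config.model_dir = "./models/llama-2-7b-exllamav2"  # assumed path to EXL2 weights
+config.prepare()
+
+model = ExLlamaV2(config)
+cache = ExLlamaV2Cache(model, lazy=True)  # lazy cache so load_autosplit can size it across GPUs
+model.load_autosplit(cache)
+tokenizer = ExLlamaV2Tokenizer(config)
+
+generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
+settings = ExLlamaV2Sampler.Settings()
+settings.temperature = 0.85
+
+print(generator.generate_simple("Write an essay about the transformer model architecture", settings, 512))
+```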
diff --git a/bench_exllamav2/bench.py b/bench_exllamav2/bench.py
index da44e370..3e5029bb 100644
--- a/bench_exllamav2/bench.py
+++ b/bench_exllamav2/bench.py
@@ -3,12 +3,13 @@
 import sys
 import time
 from collections import defaultdict
-from dataclasses import dataclass
 
 import numpy as np
 import torch
-from exllamav2 import ExLlamaV2Cache, model_init
+from exllamav2 import ExLlamaV2, ExLlamaV2Cache
+from exllamav2.config import ExLlamaV2Config
 from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
+from exllamav2.tokenizer import ExLlamaV2Tokenizer
 
 logging.getLogger("llama_cpp").setLevel(logging.ERROR)
 logging.basicConfig(
@@ -18,39 +19,26 @@
 )
 
 
-@dataclass
-class ExtraConfig:
-    model_dir: str
-    length: int = 2048
-    rope_scale: float = 1.0
-    rope_alpha: float = 1.0
-    no_flash_attn: bool = False
-    low_mem: bool = False
-    gpu_split: str = None
-
-
 class ExllamaV2Benchmark:
     def __init__(self, model_path: str) -> None:
-        self.model_path = model_path
-        self.cache = None
-        self.results = []
+        self.model_path, self.results = model_path, []
 
     def load_model(self):
-        self.model, self.tokenizer = model_init.init(
-            ExtraConfig(model_dir=self.model_path), allow_auto_split=True
-        )
+        self.config = ExLlamaV2Config()
+        self.config.model_dir = self.model_path
+        self.config.prepare()
+
+        self.model = ExLlamaV2(self.config)
+        self.cache = ExLlamaV2Cache(self.model, lazy=True)
+        self.model.load_autosplit(self.cache)
+        self.tokenizer = ExLlamaV2Tokenizer(self.config)
+
+        self.generator = ExLlamaV2BaseGenerator(self.model, self.cache, self.tokenizer)
+
         self.settings = ExLlamaV2Sampler.Settings()
         self.settings.temperature = 0.85
         self.settings.top_k = 50
         self.settings.top_p = 0.8
-        self.settings.token_repetition_penalty = 1.15
-
-        if not self.model.loaded:
-            self.cache = ExLlamaV2Cache(self.model)
-            self.model.load_autosplit(self.cache)
-
-        self.cache = None
-        self.cache = ExLlamaV2Cache(self.model)
-        self.generator = ExLlamaV2BaseGenerator(self.model, self.cache, self.tokenizer)
+        self.settings.token_repetition_penalty = 1.05
         self.settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
         self.generator.warmup()
         return self
@@ -58,9 +46,7 @@ def load_model(self):
     @torch.inference_mode()
     def run_model(self, prompt: str, max_tokens: int) -> float:
         start = time.time()
-        _ = self.generator.generate_simple(
-            prompt, self.settings, max_tokens, token_healing=True
-        )
+        _ = self.generator.generate_simple(prompt, self.settings, max_tokens, seed=1234)
         delta = time.time() - start
         return len(self.generator.sequence_ids[0]) / delta
 
diff --git a/bench_exllamav2/bench.sh b/bench_exllamav2/bench.sh
index 8c179741..33c56fdc 100755
--- a/bench_exllamav2/bench.sh
+++ b/bench_exllamav2/bench.sh
@@ -1,17 +1,15 @@
 #!/bin/bash
 
-#!/bin/bash
-
 ########################################################################################################
 # Script: bench.sh
-# Description: This script runs benchmarks llama.cpp llama benchmark.
+# Description: This script runs the ExLlamaV2 Llama-2 benchmark.
 #
 # Usage: ./bench.sh [OPTIONS]
 # OPTIONS:
-# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')
-# -r, --repetitions Number of repetitions for benchmarks (default: 2)
-# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)
-# -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')
+# -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')
+# -r, --repetitions Number of repetitions for benchmarks (default: 10)
+# -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)
+# -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')
 # -lf, --log_file Logging file name.
 # -md, --models_dir Models directory.
 # -h, --help Show this help message
@@ -19,15 +17,16 @@
 
 set -euo pipefail
 
+CURRENT_DIR="$(pwd)"
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 print_usage() {
     echo "Usage: $0 [OPTIONS]"
     echo "OPTIONS:"
-    echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')"
-    echo " -r, --repetitions Number of repetitions for benchmarks (default: 2)"
-    echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)"
-    echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')"
+    echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')"
+    echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)"
+    echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)"
+    echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')"
     echo " -lf, --log_file Logging file name."
    echo " -md, --models_dir Models directory."
     echo " -h, --help Show this help message"
@@ -59,16 +58,29 @@ check_platform() {
 }
 
 check_python() {
-    if command -v python &> /dev/null
-    then
-        echo -e "\nUsing $(python --version)."
+    if command -v python &> /dev/null; then
+        PYTHON_CMD="python"
+    elif command -v python3 &> /dev/null; then
+        PYTHON_CMD="python3"
     else
-        echo -e "\nPython does not exist."
+        echo "Python is not installed."
         exit 1
     fi
 }
 
 setup() {
+
+    # Check if Logs folder exists else Make the logs folder
+    LOGS_FOLDER="$CURRENT_DIR/Logs"
+
+    if [ -d "$LOGS_FOLDER" ]; then
+        echo "Folder '$LOGS_FOLDER' already exists. Skipping."
+    else
+        # Create the folder
+        mkdir "$LOGS_FOLDER"
+        echo "'$LOGS_FOLDER' created."
+    fi
+
     echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
bash "$SCRIPT_DIR"/setup.sh } @@ -83,7 +95,7 @@ run_benchmarks() { # shellcheck disable=SC1091 source "$SCRIPT_DIR/venv/bin/activate" - python "$SCRIPT_DIR"/bench.py \ + "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ --prompt "$PROMPT" \ --repetitions "$REPETITIONS" \ --max_tokens "$MAX_TOKENS" \ @@ -143,17 +155,16 @@ while [ "$#" -gt 0 ]; do esac done +check_platform +check_python +setup # Set default values if not provided -PROMPT="${PROMPT:-"Explain what is a transformer"}" +PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" REPETITIONS="${REPETITIONS:-10}" -MAX_TOKENS="${MAX_TOKENS:-100}" +MAX_TOKENS="${MAX_TOKENS:-512}" DEVICE="${DEVICE:-'cuda'}" -LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}" +LOG_FILENAME="${LOG_FILENAME:-"$LOGS_FOLDER/benchmark_exllamav2_$(date +'%Y%m%d%H%M%S').log"}" MODELS_DIR="${MODELS_DIR:-"./models"}" -check_platform -check_cuda -check_python -setup run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR" diff --git a/bench_exllamav2/requirements.txt b/bench_exllamav2/requirements.txt index 7aea264d..eae6bc03 100644 --- a/bench_exllamav2/requirements.txt +++ b/bench_exllamav2/requirements.txt @@ -1 +1,2 @@ exllamav2 +tqdm diff --git a/bench_exllamav2/setup.sh b/bench_exllamav2/setup.sh index ee872df8..cebc3787 100755 --- a/bench_exllamav2/setup.sh +++ b/bench_exllamav2/setup.sh @@ -6,12 +6,23 @@ # requirements. ################################################################################ +check_python() { + if command -v python &> /dev/null; then + PYTHON_CMD="python" + elif command -v python3 &> /dev/null; then + PYTHON_CMD="python3" + else + echo "Python is not installed." + exit 1 + fi +} + convert_bin_to_safetensor() { local HF_MODEL_FOLDER_PATH="$1" # shellcheck disable=SC1091 source "$SCRIPT_DIR/venv/bin/activate" - python "$SCRIPT_DIR"/convert.py \ + "$PYTHON_CMD" "$SCRIPT_DIR"/convert.py \ "$HF_MODEL_FOLDER_PATH" } @@ -41,7 +52,7 @@ convert_safetensor_to_exllamav2() { else mkdir -p "$EXLLAMA_WEIGHTS_FOLDER" echo "Going for conversion to exllamav2 format from .safetensors in $QUANTIZATION bit quantization." - python "$SCRIPT_DIR/exllamav2/convert.py" \ + "$PYTHON_CMD" "$SCRIPT_DIR/exllamav2/convert.py" \ -i "$HF_WEIGHTS_FOLDER" \ -o "$EXLLAMA_WEIGHTS_FOLDER" \ -c "$SCRIPT_DIR/wikitext-test.parquet" \ @@ -54,6 +65,8 @@ convert_safetensor_to_exllamav2() { } +check_python + CURRENT_DIR="$(pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -62,7 +75,7 @@ MODELS_DIR="${MODELS_DIR:-"models/llama-2-7b-hf"}" EXLLAMA_BASE_MODEL_DIR="${EXLLAMA_BASE_MODEL_DIR:-"./models/llama-2-7b-exllamav2"}" if [ ! -d "$VENV_DIR" ]; then - python -m venv "$VENV_DIR" + "$PYTHON_CMD" -m venv "$VENV_DIR" echo "Virtual environment '$VENV_DIR' created." # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" diff --git a/bench_lightning/README.md b/bench_lightning/README.md new file mode 100644 index 00000000..ce70d8bd --- /dev/null +++ b/bench_lightning/README.md @@ -0,0 +1,35 @@ +# Lightning + +[![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/Lightning-AI/lit-gpt)   + +[Lit-GPT](https://github.com/Lightning-AI/lit-gpt) is a hackable implementation of [different Open Source LLMs](https://github.com/Lightning-AI/lit-gpt?tab=readme-ov-file#-lit-gpt-1). Lit-GPT is written using the [Lightning Fabric](https://lightning.ai/docs/fabric/stable/) framework. 
+Lightning Fabric is a fast and lightweight way to scale PyTorch models. It comes with features that enable distributed training and inference with ease. Lightning Fabric is based on [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/starter/introduction.html).
+
+
+### 🚀 Running the Lightning Benchmark.
+
+You can run the Lightning benchmark using the following command:
+
+```bash
+./bench_lightning/bench.sh \
+ --prompt \ # Enter a prompt string
+ --max_tokens \ # Maximum number of tokens to output
+ --repetitions \ # Number of repetitions to be made for the prompt.
+ --log_file \ # A .log file under which we want to write the results.
+ --device \ # The device on which we want to run the benchmark.
+ --models_dir # The directory in which model weights are present
+```
+
+To get started quickly you can simply run:
+
+```bash
+./bench_lightning/bench.sh -d cuda
+```
+This will take all the default values (see the [bench.sh](/bench_lightning/bench.sh) file) and perform the benchmarks. You can find all the benchmark results for Lightning [here](/docs/llama2.md).
+
+
+### 👀 Some points to note:
+
+1. This implementation runs the Llama-2-7B model. Lit-GPT requires converting HuggingFace models to its own lit-gpt format; the conversion step can be found in the [setup.sh](/bench_lightning/setup.sh) file.
+2. Running this benchmark requires the [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b), so we assume that you have already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and been verified to download the weights.
+3. On Metal devices it runs out of memory, so we were not able to run inference on Metal.
+4. On CPU it is simply too slow (over a minute per run), so we skipped CPU benchmarking.
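+
+### 🧩 A minimal Fabric sketch
+
+To make the Lightning Fabric point above concrete, here is a minimal, hedged sketch of how Fabric wraps a plain PyTorch model for device placement and precision handling. The toy module and settings are illustrative assumptions, not the Lit-GPT benchmark code:
+
+```python
+import torch
+import lightning as L
+
+fabric = L.Fabric(accelerator="cuda", devices=1, precision="bf16-true")
+fabric.launch()
+
+model = torch.nn.Linear(4096, 4096)  # stand-in for an LLM block
+model = fabric.setup(model)          # moves the module to the device in the chosen precision
+
+x = torch.randn(1, 4096, device=fabric.device, dtype=torch.bfloat16)
+with torch.inference_mode():
+    y = model(x)
+print(y.shape)
+```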
diff --git a/bench_lightning/bench.sh b/bench_lightning/bench.sh
index d9ace574..359a1d4a 100755
--- a/bench_lightning/bench.sh
+++ b/bench_lightning/bench.sh
@@ -2,14 +2,14 @@
 
 ########################################################################################################
 # Script: bench.sh
-# Description: This script runs benchmarks llama.cpp llama benchmark.
+# Description: This script runs the LightningAI Lit-GPT Llama-2 benchmark.
 #
 # Usage: ./bench.sh [OPTIONS]
 # OPTIONS:
-# -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')
-# -r, --repetitions Number of repetitions for benchmarks (default: 2)
-# -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)
-# -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')
+# -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')
+# -r, --repetitions Number of repetitions for benchmarks (default: 10)
+# -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)
+# -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')
 # -lf, --log_file Logging file name.
 # -md, --models_dir Models directory.
 # -h, --help Show this help message
@@ -17,15 +17,16 @@
 
 set -euo pipefail
 
+CURRENT_DIR="$(pwd)"
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 print_usage() {
     echo "Usage: $0 [OPTIONS]"
     echo "OPTIONS:"
-    echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')"
-    echo " -r, --repetitions Number of repetitions for benchmarks (default: 2)"
-    echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)"
-    echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')"
+    echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')"
+    echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)"
+    echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)"
+    echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')"
     echo " -lf, --log_file Logging file name."
     echo " -md, --models_dir Models directory."
     echo " -h, --help Show this help message"
@@ -69,6 +70,17 @@ check_python() {
 
 setup() {
+    # Check if Logs folder exists else Make the logs folder
+    LOGS_FOLDER="$CURRENT_DIR/Logs"
+
+    if [ -d "$LOGS_FOLDER" ]; then
+        echo "Folder '$LOGS_FOLDER' already exists. Skipping."
+    else
+        # Create the folder
+        mkdir "$LOGS_FOLDER"
+        echo "'$LOGS_FOLDER' created."
+    fi
+
     echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..."
     bash "$SCRIPT_DIR"/setup.sh
 }
@@ -141,15 +153,16 @@ while [ "$#" -gt 0 ]; do
     esac
 done
 
+check_platform
+check_python
+setup
+
 # Set default values if not provided
-PROMPT="${PROMPT:-"Explain what is a transformer"}"
+PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}"
 REPETITIONS="${REPETITIONS:-10}"
-MAX_TOKENS="${MAX_TOKENS:-100}"
-DEVICE="${DEVICE:-'cpu'}"
-LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}"
+MAX_TOKENS="${MAX_TOKENS:-512}"
+DEVICE="${DEVICE:-'cuda'}"
+LOG_FILENAME="${LOG_FILENAME:-"$LOGS_FOLDER/benchmark_lightning_$(date +'%Y%m%d%H%M%S').log"}"
 MODELS_DIR="${MODELS_DIR:-"./models"}"
 
-check_platform
-check_python
-setup
 run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR"
diff --git a/docs/llama2.md b/docs/llama2.md
index a5ac2fa1..0277640a 100644
--- a/docs/llama2.md
+++ b/docs/llama2.md
@@ -18,12 +18,12 @@
 | tinygrad | - | 20.32 ± 0.06 | - | - |
 | onnx | - | 54.16 ± 3.15 | - | - |
 | transformers (pytorch) | 43.79 ± 0.61 | 46.39 ± 0.28 | 6.98 ± 0.05 | 21.72 ± 0.11 |
-| vllm | 91.23 ± 3.61 | 91.44 ± 3.83 | - | 113.38 ± 11.70|
-| exllamav2 | - | - | 116.91 ± 1.73 | 164.28 ± 4.07 |
+| vllm | 90.78 ± 1.60 | 90.54 ± 2.22 | - | - |
+| exllamav2 | - | - | 121.63 ± 0.74 | 130.16 ± 0.35 |
 | ctransformers | - | - | 76.75 ± 10.36 | 84.26 ± 5.79 |
 | AutoGPTQ | 42.01 ± 1.03 | 30.24 ± 0.41 | - | - |
-| AutoAWQ | - | - | - | 116.94 ± 13.14|
-| DeepSpeed | - | 83.54 ± 5.25 | - | |
+| AutoAWQ | - | - | - | 109.20 ± 3.28 |
+| DeepSpeed | - | 81.44 ± 8.13 | - | |
 | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 |
 | Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - |
 | Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - |
diff --git a/docs/llama2.md.template b/docs/llama2.md.template
index 1c1f6950..bd99774a 100644
--- a/docs/llama2.md.template
+++ b/docs/llama2.md.template
@@ -18,12 +18,12 @@
 | tinygrad | - | 20.32 ± 0.06 | - | - |
 | onnx | - | 54.16 ± 3.15 | - | - |
 | transformers (pytorch) | 43.79 ± 0.61 | 46.39 ± 0.28 | 6.98 ± 0.05 | 21.72 ± 0.11 |
-| vllm | 91.23 ± 3.61 | 91.44 ± 3.83 | - | 113.38 ± 11.70|
-| exllamav2 | - | - | 116.91 ± 1.73 | 164.28 ± 4.07 |
+| vllm | 90.78 ± 1.60 | 90.54 ± 2.22 | - | - |
+| exllamav2 | - | - | 121.63 ± 0.74 | 130.16 ± 0.35 |
 | ctransformers | - | - | 76.75 ± 10.36 | 84.26 ± 5.79 |
 | AutoGPTQ | 42.01 ± 1.03 | 30.24 ± 0.41 | - | - |
-| AutoAWQ | - | - | - | 116.94 ± 13.14|
-| DeepSpeed | - | 83.54 ± 5.25 | - | |
+| AutoAWQ | - | - | - | 109.20 ± 3.28 |
+| DeepSpeed | - | 81.44 ± 8.13 | - | |
 | PyTorch Lightning | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 |
 | Optimum Nvidia | 110.36 ± 0.52| 109.09 ± 4.26 | - | - |
 | Nvidia TensorRT-LLM | 60.39 ± 0.62 | 101.94 ± 8.34 | - | - |
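
All of the mean ± std entries in the tables above are computed the same way: each bench.py collects one tokens/sec sample per repetition and reports `np.mean`/`np.std` over those samples (see the `report` code in the AutoAWQ diff). A minimal sketch of that reduction, with illustrative numbers rather than real results:

```python
import numpy as np

# tokens/sec collected over the benchmark repetitions (illustrative values)
results = [109.9, 107.4, 110.3]

print(f"{np.mean(results):.2f} ± {np.std(results):.2f}")
```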