1 | 1 | {
[Previous revision (old lines 2-322), shown as removed in this diff: identical to the version added below except that it did not yet contain the "Summary" markdown cell or the "Note on Triton Autotuning and Caching Example" markdown cell.]
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "id": "9012182b", |
| 6 | + "metadata": {}, |
| 7 | + "source": [ |
| 8 | + "## Summary\n", |
| 9 | + "\n", |
| 10 | + "This notebook benchmarks PyTorch's scaled_dot_product_attention (SDPA) against a vLLMs Triton-based flash attention kernel.\n", |
| 11 | + "\n", |
| 12 | + "Key highlights:\n", |
| 13 | + "- Environment Setup: GPU checks and Triton installation.\n", |
| 14 | + "- Baseline Performance: Measure PyTorch SDPA runtimes for various sequence lengths.\n", |
| 15 | + "- vLLM Triton Kernel Benchmark: Compare initial vLLM kernel performance vs. PyTorch.\n", |
| 16 | + " - Triton Autotuning & Caching:\n", |
| 17 | + " - The first run triggers autotuning (testing multiple configurations), making it slower.\n", |
| 18 | + " - The best configuration is cached for future runs.\n", |
| 19 | + " - Subsequent runs reuse the cached kernel and run significantly faster without re-tuning.\n", |
| 20 | + "- Visualization: Clear plots show performance improvements before and after autotuning.\n", |
| 21 | + "- Speedup Summary: A table and plots demonstrate consistent 2-4x speedups compared to PyTorch after caching.\n" |
| 22 | + ] |
| 23 | + }, |
| 24 | + { |
| 25 | + "cell_type": "code", |
| 26 | + "execution_count": null, |
| 27 | + "id": "29482448-d69e-4412-82b8-e6b8243699fe", |
| 28 | + "metadata": {}, |
| 29 | + "outputs": [], |
| 30 | + "source": [ |
| 31 | + "!python triton-gpu-check.py" |
| 32 | + ] |
| 33 | + }, |
| 34 | + { |
| 35 | + "cell_type": "code", |
| 36 | + "execution_count": null, |
| 37 | + "id": "591e3d51-8a52-45b5-b6a8-99acd0b1180c", |
| 38 | + "metadata": {}, |
| 39 | + "outputs": [], |
| 40 | + "source": [ |
| 41 | + "!cd triton && pip install ./python && cd -" |
| 42 | + ] |
| 43 | + }, |
| 44 | + { |
| 45 | + "cell_type": "code", |
| 46 | + "execution_count": null, |
| 47 | + "id": "2278ff2f", |
| 48 | + "metadata": {}, |
| 49 | + "outputs": [], |
| 50 | + "source": [ |
| 51 | + "import torch\n", |
| 52 | + "import triton\n", |
| 53 | + "import triton.language as tl\n", |
| 54 | + "import matplotlib.pyplot as plt\n", |
| 55 | + "import time\n", |
| 56 | + "\n", |
| 57 | + "print(\"Torch version:\", torch.__version__)\n", |
| 58 | + "print(\"Triton version:\", triton.__version__)" |
| 59 | + ] |
| 60 | + }, |
| 61 | + { |
| 62 | + "cell_type": "markdown", |
| 63 | + "id": "75bc884d", |
| 64 | + "metadata": {}, |
| 65 | + "source": [ |
| 66 | + "## Flash Attention Benchmark (PyTorch SDPA vs vLLM Kernel)\n", |
| 67 | + "This notebook benchmarks the PyTorch `scaled_dot_product_attention` against the vLLM Triton-based flash attention kernel." |
| 68 | + ] |
| 69 | + }, |
| 70 | + { |
| 71 | + "cell_type": "code", |
| 72 | + "execution_count": null, |
| 73 | + "id": "vllm-import", |
| 74 | + "metadata": {}, |
| 75 | + "outputs": [], |
| 76 | + "source": [ |
| 77 | + "# Assuming vllm_flash_attention.py is present in the same directory or accessible path\n", |
| 78 | + "from flash_attention import triton_attention as vllm_flash_attention\n", |
| 79 | + "from flash_attention import benchmark_flash_attention as vllm_benchmark\n" |
| 80 | + ] |
| 81 | + }, |
| 82 | + { |
| 83 | + "cell_type": "code", |
| 84 | + "execution_count": null, |
| 85 | + "id": "e4958854-eb3a-44d8-a52e-c4f77cefce94", |
| 86 | + "metadata": {}, |
| 87 | + "outputs": [], |
| 88 | + "source": [ |
| 89 | + "!ls /workspace/.triton/cache" |
| 90 | + ] |
| 91 | + }, |
| 92 | + { |
| 93 | + "cell_type": "code", |
| 94 | + "execution_count": null, |
| 95 | + "id": "bc0bbe33", |
| 96 | + "metadata": {}, |
| 97 | + "outputs": [], |
| 98 | + "source": [ |
| 99 | + "def run_pytorch_sdpa(q, k, v):\n", |
| 100 | + " return torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0)" |
| 101 | + ] |
| 102 | + }, |
| 103 | + { |
| 104 | + "cell_type": "code", |
| 105 | + "execution_count": null, |
| 106 | + "id": "8ac54ef2-2877-49bb-aca6-c7ca2e6d55d2", |
| 107 | + "metadata": {}, |
| 108 | + "outputs": [], |
| 109 | + "source": [ |
| 110 | + "!ls /workspace/.triton/cache" |
| 111 | + ] |
| 112 | + }, |
| 113 | + { |
| 114 | + "cell_type": "code", |
| 115 | + "execution_count": null, |
| 116 | + "id": "vllm-kernel-wrapper", |
| 117 | + "metadata": {}, |
| 118 | + "outputs": [], |
| 119 | + "source": [ |
| 120 | + "def run_vllm_flash_attention(q, k, v, seqlen):\n", |
| 121 | + " q_flat = q.permute(0, 2, 1, 3).reshape(-1, q.shape[1], q.shape[3])\n", |
| 122 | + " k_flat = k.permute(0, 2, 1, 3).reshape(-1, k.shape[1], k.shape[3])\n", |
| 123 | + " v_flat = v.permute(0, 2, 1, 3).reshape(-1, v.shape[1], v.shape[3])\n", |
| 124 | + " cu_seqlens_q = torch.arange(0, q.shape[0] + 1, dtype=torch.int32, device=q.device) * seqlen\n", |
| 125 | + " cu_seqlens_k = torch.arange(0, q.shape[0] + 1, dtype=torch.int32, device=q.device) * seqlen\n", |
| 126 | + " o, _ = vllm_flash_attention(q_flat, k_flat, v_flat, None, cu_seqlens_q, cu_seqlens_k, seqlen, seqlen, False, 1.0, None)\n", |
| 127 | + " return o.view(q.shape[0], seqlen, q.shape[1], q.shape[3]).permute(0, 2, 1, 3)" |
| 128 | + ] |
| 129 | + }, |
| 130 | + { |
| 131 | + "cell_type": "code", |
| 132 | + "execution_count": null, |
| 133 | + "id": "b911bed7", |
| 134 | + "metadata": {}, |
| 135 | + "outputs": [], |
| 136 | + "source": [ |
| 137 | + "def benchmark_flash_attention(batch, nheads, head_dim, seqlen):\n", |
| 138 | + " q = torch.randn(batch, nheads, seqlen, head_dim, device='cuda')\n", |
| 139 | + " k = torch.randn(batch, nheads, seqlen, head_dim, device='cuda')\n", |
| 140 | + " v = torch.randn(batch, nheads, seqlen, head_dim, device='cuda')\n", |
| 141 | + "\n", |
| 142 | + " torch.cuda.synchronize()\n", |
| 143 | + " start = time.time()\n", |
| 144 | + " out_torch = run_pytorch_sdpa(q, k, v)\n", |
| 145 | + " torch.cuda.synchronize()\n", |
| 146 | + " pytorch_time = time.time() - start\n", |
| 147 | + "\n", |
| 148 | + " torch.cuda.synchronize()\n", |
| 149 | + " start = time.time()\n", |
| 150 | + " out_vllm = run_vllm_flash_attention(q, k, v, seqlen)\n", |
| 151 | + " torch.cuda.synchronize()\n", |
| 152 | + " vllm_time = time.time() - start\n", |
| 153 | + "\n", |
| 154 | + " diff_vllm = torch.max(torch.abs(out_torch - out_vllm)).item()\n", |
| 155 | + " return pytorch_time, vllm_time, diff_vllm" |
| 156 | + ] |
| 157 | + }, |
| 158 | + { |
| 159 | + "cell_type": "code", |
| 160 | + "execution_count": null, |
| 161 | + "id": "2d1ce123", |
| 162 | + "metadata": {}, |
| 163 | + "outputs": [], |
| 164 | + "source": [ |
| 165 | + "seqlens = [128, 256, 512, 1024]\n", |
| 166 | + "batch, nheads, head_dim = 32, 8, 64\n", |
| 167 | + "pytorch_times, vllm_times, vllm_diffs = [], [], []\n", |
| 168 | + "\n", |
| 169 | + "for seqlen in seqlens:\n", |
| 170 | + " t_pt, t_vllm, d_vllm = benchmark_flash_attention(batch, nheads, head_dim, seqlen)\n", |
| 171 | + " pytorch_times.append(t_pt)\n", |
| 172 | + " vllm_times.append(t_vllm)\n", |
| 173 | + " vllm_diffs.append(d_vllm)\n", |
| 174 | + " print(f\"Seqlen={seqlen}: PyTorch CUDA={t_pt:.4f}s, vLLM CUDA={t_vllm:.4f}s, Diff(vLLM)={d_vllm:.2e}\")" |
| 175 | + ] |
| 176 | + }, |
| 177 | + { |
| 178 | + "cell_type": "code", |
| 179 | + "execution_count": null, |
| 180 | + "id": "b99804e7-4693-445e-a473-e2c243f77f70", |
| 181 | + "metadata": {}, |
| 182 | + "outputs": [], |
| 183 | + "source": [ |
| 184 | + "!ls /workspace/.triton/cache" |
| 185 | + ] |
| 186 | + }, |
| 187 | + { |
| 188 | + "cell_type": "code", |
| 189 | + "execution_count": null, |
| 190 | + "id": "6b8fe26d", |
| 191 | + "metadata": {}, |
| 192 | + "outputs": [], |
| 193 | + "source": [ |
| 194 | + "plt.figure()\n", |
| 195 | + "plt.plot(seqlens, pytorch_times, label=\"PyTorch SDPA (CUDA)\")\n", |
| 196 | + "plt.plot(seqlens, vllm_times, label=\"vLLM Flash Attention (CUDA)\")\n", |
| 197 | + "plt.xlabel(\"Sequence Length\")\n", |
| 198 | + "plt.ylabel(\"Time (s)\")\n", |
| 199 | + "plt.title(\"Flash Attention Performance: PyTorch vs vLLM on CUDA\")\n", |
| 200 | + "plt.legend()\n", |
| 201 | + "plt.grid()\n", |
| 202 | + "plt.show()" |
| 203 | + ] |
| 204 | + }, |
| 205 | + { |
| 206 | + "cell_type": "markdown", |
| 207 | + "id": "212e60a2", |
| 208 | + "metadata": {}, |
| 209 | + "source": [ |
| 210 | + "## What is Triton Autotuning?\n", |
| 211 | + "Triton allows kernels to be **autotuned**, meaning it will try multiple kernel configurations (block sizes, warp counts, pipeline stages) to find the optimal setup for your specific GPU hardware and workload shape.\n", |
| 212 | + "\n", |
| 213 | + "This autotuning process significantly improves performance and ensures the kernel is utilizing the GPU most efficiently.\n", |
| 214 | + "\n", |
| 215 | + "**How does it work?** \n", |
| 216 | + "- Triton runs benchmarks internally with different configurations. \n", |
| 217 | + "- It measures which configurations are fastest. \n", |
| 218 | + "- The result is cached, so future runs use the best-found setup.\n", |
| 219 | + "\n", |
| 220 | + "**Why do we re-run tuning?** \n", |
| 221 | + "- Hardware setups or driver versions may change. \n", |
| 222 | + "- Workload shapes (sequence lengths, batch sizes) might differ from defaults. \n", |
| 223 | + "- We want to confirm we’re using the best configuration for *this exact benchmark*.\n", |
| 224 | + "\n", |
| 225 | + "In the next cell, we trigger this autotuning pass.\n" |
| 226 | + ] |
| 227 | + }, |
| 228 | + { |
| 229 | + "cell_type": "markdown", |
| 230 | + "id": "51ef7113", |
| 231 | + "metadata": {}, |
| 232 | + "source": [ |
| 233 | + "## Note on Triton Autotuning and Caching Example\n", |
| 234 | + "\n", |
| 235 | + "- On the **first run**, when a specific kernel configuration (based on GPU hardware, batch size, sequence length, and head dimensions) is encountered for the first time, **Triton triggers autotuning**. \n", |
| 236 | + " - This process tries multiple kernel configurations in the background and picks the fastest one.\n", |
| 237 | + " - As a result, the **first run may be significantly slower** due to this tuning process.\n", |
| 238 | + "\n", |
| 239 | + "- Once the best-performing configuration is found, it is **stored in Triton's cache** (typically in `/workspace/.triton/cache`).\n", |
| 240 | + "\n", |
| 241 | + "- On **subsequent runs** with the same input shape and environment:\n", |
| 242 | + " - Triton **loads the tuned configuration from cache** and skips tuning.\n", |
| 243 | + " - This leads to **consistently fast kernel launches and execution** without re-tuning overhead.\n", |
| 244 | + "\n", |
| 245 | + "- If you clear the cache, the next run will re-trigger autotuning.\n", |
| 246 | + "\n", |
| 247 | + "> In short: \n", |
| 248 | + "> - First run = autotuning + execution (slow but smart) \n", |
| 249 | + "> - All future runs = cached config + execution (fast and efficient)\n" |
| 250 | + ] |
| 251 | + }, |
| 252 | + { |
| 253 | + "cell_type": "code", |
| 254 | + "execution_count": null, |
| 255 | + "id": "f972971b-fef9-4814-926a-da87432b47bb", |
| 256 | + "metadata": {}, |
| 257 | + "outputs": [], |
| 258 | + "source": [ |
| 259 | + "# Trigger re-tuning (will reuse cached or search if needed)\n", |
| 260 | + "vllm_benchmark.run(show_plots=False, print_data=True)" |
| 261 | + ] |
| 262 | + }, |
| 263 | + { |
| 264 | + "cell_type": "code", |
| 265 | + "execution_count": null, |
| 266 | + "id": "1ec96d79-50fe-4814-a668-27c95ceaa04f", |
| 267 | + "metadata": {}, |
| 268 | + "outputs": [], |
| 269 | + "source": [ |
| 270 | + "vllm_tuned_times = []\n", |
| 271 | + "\n", |
| 272 | + "for seqlen in seqlens:\n", |
| 273 | + " q = torch.randn(batch, nheads, seqlen, head_dim, device='cuda')\n", |
| 274 | + " k = torch.randn(batch, nheads, seqlen, head_dim, device='cuda')\n", |
| 275 | + " v = torch.randn(batch, nheads, seqlen, head_dim, device='cuda')\n", |
| 276 | + "\n", |
| 277 | + " torch.cuda.synchronize()\n", |
| 278 | + " start = time.time()\n", |
| 279 | + " out_vllm_tuned = run_vllm_flash_attention(q, k, v, seqlen)\n", |
| 280 | + " torch.cuda.synchronize()\n", |
| 281 | + " tuned_time = time.time() - start\n", |
| 282 | + " vllm_tuned_times.append(tuned_time)\n", |
| 283 | + " print(f\"Seqlen={seqlen}: Tuned vLLM CUDA={tuned_time:.4f}s\")" |
| 284 | + ] |
| 285 | + }, |
| 286 | + { |
| 287 | + "cell_type": "code", |
| 288 | + "execution_count": null, |
| 289 | + "id": "0a7e9593-48a3-42f0-a94d-6a45a4388354", |
| 290 | + "metadata": {}, |
| 291 | + "outputs": [], |
| 292 | + "source": [ |
| 293 | + "print(f\"{'SeqLen':>8} | {'PyTorch Time (s)':>18} | {'vLLM Tuned Time (s)':>20} | {'Speedup (PyTorch/vLLM)':>24}\")\n", |
| 294 | + "print(\"-\" * 75)\n", |
| 295 | + "for seqlen, pt_time, tuned_time in zip(seqlens, pytorch_times, vllm_tuned_times):\n", |
| 296 | + " speedup = pt_time / tuned_time\n", |
| 297 | + " print(f\"{seqlen:8} | {pt_time:18.6f} | {tuned_time:20.6f} | {speedup:24.2f}x\")\n" |
| 298 | + ] |
| 299 | + }, |
| 300 | + { |
| 301 | + "cell_type": "code", |
| 302 | + "execution_count": null, |
| 303 | + "id": "627a2f59-96ae-4994-854e-b1c9259529bd", |
| 304 | + "metadata": {}, |
| 305 | + "outputs": [], |
| 306 | + "source": [ |
| 307 | + "plt.figure()\n", |
| 308 | + "plt.plot(seqlens, pytorch_times, label=\"PyTorch SDPA (CUDA)\")\n", |
| 309 | + "plt.plot(seqlens, vllm_times, label=\"vLLM (Original)\")\n", |
| 310 | + "plt.plot(seqlens, vllm_tuned_times, label=\"vLLM (Autotuned)\")\n", |
| 311 | + "plt.xlabel(\"Sequence Length\")\n", |
| 312 | + "plt.ylabel(\"Time (s)\")\n", |
| 313 | + "plt.title(\"Flash Attention Benchmark: PyTorch vs vLLM (Before & After Autotune)\")\n", |
| 314 | + "plt.legend()\n", |
| 315 | + "plt.grid()\n", |
| 316 | + "plt.show()" |
| 317 | + ] |
| 318 | + }, |
| 319 | + { |
| 320 | + "cell_type": "code", |
| 321 | + "execution_count": null, |
| 322 | + "id": "fd225668-944f-4058-a20e-973188c7442e", |
| 323 | + "metadata": {}, |
| 324 | + "outputs": [], |
| 325 | + "source": [ |
| 326 | + "plt.figure()\n", |
| 327 | + "plt.plot(seqlens, pytorch_times, label=\"PyTorch SDPA (CUDA)\")\n", |
| 328 | + "plt.plot(seqlens, vllm_tuned_times, label=\"vLLM (Autotuned)\")\n", |
| 329 | + "plt.xlabel(\"Sequence Length\")\n", |
| 330 | + "plt.ylabel(\"Time (s)\")\n", |
| 331 | + "plt.title(\"Flash Attention Benchmark: PyTorch vs vLLM (After Autotune)\")\n", |
| 332 | + "plt.legend()\n", |
| 333 | + "plt.grid()\n", |
| 334 | + "plt.show()" |
| 335 | + ] |
| 336 | + }, |
| 337 | + { |
| 338 | + "cell_type": "code", |
| 339 | + "execution_count": null, |
| 340 | + "id": "487d9793-b35d-4fa6-ab05-7843d5fe96b5", |
| 341 | + "metadata": {}, |
| 342 | + "outputs": [], |
| 343 | + "source": [] |
| 344 | + } |
| 345 | + ], |
| 346 | + "metadata": { |
| 347 | + "kernelspec": { |
| 348 | + "display_name": "Python 3 (ipykernel)", |
| 349 | + "language": "python", |
| 350 | + "name": "python3" |
| 351 | + }, |
| 352 | + "language_info": { |
| 353 | + "codemirror_mode": { |
| 354 | + "name": "ipython", |
| 355 | + "version": 3 |
| 356 | + }, |
| 357 | + "file_extension": ".py", |
| 358 | + "mimetype": "text/x-python", |
| 359 | + "name": "python", |
| 360 | + "nbconvert_exporter": "python", |
| 361 | + "pygments_lexer": "ipython3", |
| 362 | + "version": "3.12.5" |
| 363 | + } |
| 364 | + }, |
| 365 | + "nbformat": 4, |
| 366 | + "nbformat_minor": 5 |
| 367 | +} |
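
The autotuning and caching behaviour described in the notebook's markdown cells can be seen in isolation with a minimal, self-contained kernel. The sketch below is illustrative only and is not part of the notebook or the vLLM kernel: it relies on the public `triton.autotune`, `triton.Config`, and `triton.jit` APIs, and the element-wise `add_kernel` is a hypothetical example. On the first launch for a given `n_elements`, Triton times each listed config and remembers the fastest for later launches with the same key; the compiled kernels land in the on-disk cache directory the notebook inspects with `!ls /workspace/.triton/cache` when `TRITON_CACHE_DIR` points there.

```python
# Minimal sketch of Triton autotuning, independent of the vLLM kernel.
# On the first launch for a given n_elements, Triton benchmarks every Config
# below and keeps the fastest; later launches with the same key reuse it.
import torch
import triton
import triton.language as tl

@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 256}, num_warps=4),
        triton.Config({"BLOCK_SIZE": 1024}, num_warps=8),
    ],
    key=["n_elements"],  # re-tune only when this value changes
)
@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)

def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    n_elements = out.numel()
    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
    # BLOCK_SIZE is supplied by the autotuner, so it is not passed here.
    add_kernel[grid](x, y, out, n_elements)
    return out

x = torch.randn(1 << 20, device="cuda")
y = torch.randn(1 << 20, device="cuda")
print(torch.allclose(add(x, y), x + y))  # first call is slower: it includes tuning
```

Timing `add(x, y)` twice reproduces the pattern the note above describes: the first call pays for autotuning, every later call with the same shape reuses the cached choice.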
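
The notebook's timing cells wrap a single call in `time.time()`, so the first vLLM measurement also absorbs the one-off autotuning pass, which is exactly what the before/after comparison is meant to expose. Where steadier per-kernel numbers are also wanted, a common pattern is to warm up first and average over several iterations with CUDA events. The helper below is a sketch under that assumption (the name `time_cuda` and its defaults are invented here), not a change to the notebook's methodology.

```python
import torch

def time_cuda(fn, *args, warmup: int = 3, iters: int = 10) -> float:
    """Return the mean runtime of fn(*args) in seconds, excluding warm-up
    (and therefore excluding Triton's one-off autotuning pass)."""
    for _ in range(warmup):
        fn(*args)
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn(*args)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / 1000.0 / iters  # elapsed_time is in milliseconds

# Hypothetical usage with the notebook's helpers:
# sdpa_s = time_cuda(run_pytorch_sdpa, q, k, v)
# vllm_s = time_cuda(run_vllm_flash_attention, q, k, v, seqlen)
```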