ROCm · sanyalington · Aug 14, 2024 · Aug 12, 2024 · Aug 12, 2024 · Aug 12, 2024
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
@@ -9,7 +9,7 @@
 from vllm._custom_C import paged_attention_custom
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
 
-NUM_BLOCKS = 1024
+NUM_BLOCKS = 1024 * 1024
 PARTITION_SIZE = 256
 
 
@@ -176,7 +176,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
     if do_profile:
         latency = run_benchmark(num_iters=1, profile=True)
     else:
-        latency = run_benchmark(num_iters=100, profile=False)
+        latency = run_benchmark(num_iters=1000, profile=False)
     print(f"Kernel running time: {latency * 1000000:.3f} us")