diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 4bbacd1abfdf4..1ea9b6b92024e 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -62,19 +62,9 @@
-# Capture graphs for token size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
+# Capture graphs for token size 1, 2, 4, 8, 16, 24, 32, 40, ..., 1024.
 # NOTE: _get_graph_batch_size needs to be updated if this list is changed.
 _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
-    _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)
+    _BATCH_SIZE_ALIGNMENT * i for i in range(1, 129)
 ]
-# Get the current GPU properties
-gpu_properties = torch.cuda.get_device_properties(0)
-# Retrieve the total memory in GB
-mem = gpu_properties.total_memory / 1024**3
-# Retrieve the SM version
-gpu_sm_version = gpu_properties.major + gpu_properties.minor / 10.0
-# extend cuda graph for H200 GPUs
-if mem > 120.0 and gpu_sm_version >= 9.0:
-    _BATCH_SIZES_TO_CAPTURE.extend([512, 768])
-
 _NUM_WARMUP_ITERS = 2
 
 
 TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU")
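
Assuming `_BATCH_SIZE_ALIGNMENT` is 8, as in the surrounding file, extending the comprehension to `range(1, 129)` raises the largest captured graph size from 8 * 32 = 256 to 8 * 128 = 1024. That subsumes the 512 and 768 sizes the deleted H200-specific block appended, and it applies uniformly instead of probing device 0's memory and SM version at import time. The NOTE ties the list to `_get_graph_batch_size`; below is a minimal sketch of the rounding logic the list implies (the helper's actual body may differ), which shows why the two must stay in sync:

```python
_BATCH_SIZE_ALIGNMENT = 8  # assumed value, matching the surrounding file


def _get_graph_batch_size(batch_size: int) -> int:
    """Pad a runtime batch size up to the nearest captured graph size.

    Captured sizes are 1, 2, 4, then multiples of _BATCH_SIZE_ALIGNMENT,
    so this rounding must agree with _BATCH_SIZES_TO_CAPTURE.
    """
    if batch_size <= 2:
        return batch_size
    elif batch_size <= 4:
        return 4
    # Round up to the next multiple of the alignment.
    return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) //
            _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT)
```

For example, a runtime batch of 300 pads to 304 (8 * 38), which falls inside the new 1024 cap but outside the old 256 one, so such batches can now replay a captured graph instead of falling back to eager execution.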