Description
🐛 Bug
from vllm import LLM, SamplingParams
llm = LLM(model=model_dir, enforce_eager=True)
then the following error is raised:
File d:\my\env\python3.10.10\lib\site-packages\xformers\ops\fmha\_triton\splitk_kernels.py:614, in autotune_kernel(kernel)
604 WARPS_VALUES = [1, 2, 4]
606 TRITON_CONFIGS = [
607 gen_config(block_m, block_n, stages, warps)
608 for block_m in BLOCK_M_VALUES
(...)
611 for warps in WARPS_VALUES
612 ]
--> 614 kernel = triton.autotune(
615 configs=TRITON_CONFIGS,
616 key=AUTOTUNER_KEY,
617 use_cuda_graph=True,
618 )(kernel)
619 return kernel
TypeError: autotune() got an unexpected keyword argument 'use_cuda_graph'
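The installed Triton 2.1.0 does not appear to accept the use_cuda_graph keyword that this xformers build passes to triton.autotune, while newer Triton releases do. A minimal diagnostic sketch (written for this report, assuming triton imports cleanly) to check whether the installed autotune() knows the keyword:

import inspect
import triton

# Does the installed Triton's autotune() accept the keyword that xformers passes?
supported = "use_cuda_graph" in inspect.signature(triton.autotune).parameters
print(f"triton {triton.__version__}: use_cuda_graph supported = {supported}")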
Command
To Reproduce
Steps to reproduce the behavior:
1. pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu124
2. Build vLLM from source:
git clone https://github.com/vllm-project/vllm.git
cd vllm
python use_existing_torch.py
pip install -r requirements-common.txt
python setup.py install
3. Use it from vLLM (a version-check sketch follows these steps):
from vllm import LLM, SamplingParams
llm = LLM(model=model_dir, enforce_eager=True)
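As mentioned in step 3, it may help to confirm which versions actually ended up installed, since the traceback points at a mismatch between the xformers wheel and the Triton it imports. A minimal sketch, assuming all of the packages below were installed with pip:

from importlib.metadata import PackageNotFoundError, version

# Print the installed versions of the packages involved in the failing import chain.
for pkg in ("torch", "xformers", "triton", "vllm"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "is not installed")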
Expected behavior
The LLM should initialize without raising a TypeError.
Environment
Please copy and paste the output from the environment collection script from PyTorch (or fill out the checklist below manually). You can run the script with: python -m torch.utils.collect_env
PyTorch version: 2.5.0+cu124
Is debug build: False
CUDA used to build PyTorch: 12.4
ROCM used to build PyTorch: N/A
OS: Microsoft Windows 11 Pro (专业版)
GCC version: (x86_64-posix-seh-rev0, Built by MinGW-Builds project) 13.2.0
Clang version: Could not collect
CMake version: Could not collect
Libc version: N/A
Python version: 3.10.10 (tags/v3.10.10:aad5f6a, Feb 7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)] (64-bit runtime)
Python platform: Windows-10-10.0.22631-SP0
Is CUDA available: True
CUDA runtime version: 12.5.40
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration: GPU 0: NVIDIA GeForce RTX 4090
Nvidia driver version: 560.94
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True
CPU:
Architecture=9
CurrentClockSpeed=3200
DeviceID=CPU0
Family=207
L2CacheSize=32768
L2CacheSpeed=
Manufacturer=GenuineIntel
MaxClockSpeed=3200
Name=13th Gen Intel(R) Core(TM) i9-13900KS
ProcessorType=3
Revision=
Versions of relevant libraries:
[pip3] mypy-extensions==1.0.0
[pip3] numpy==1.26.4
[pip3] torch==2.5.0+cu124
[pip3] torchaudio==2.5.0+cu124
[pip3] torchvision==0.20.0+cu124
[pip3] triton==2.1.0
[pip3] vector-quantize-pytorch==1.14.24
[conda] Could not collect
- PyTorch Version (e.g., 1.0): torch 2.5.0+cu124
- OS (e.g., Linux): Windows
- How you installed PyTorch (conda, pip, source): pip
- Build command you used (if compiling from source):
- Python version: 3.10.10
- CUDA/cuDNN version: Build cuda_12.5.r12.5/compiler.34177558_0
- GPU models and configuration: NVIDIA GeForce RTX 4090
- Any other relevant information:
Additional context
Full traceback:
TypeError Traceback (most recent call last)
Cell In[2], line 5
1 from vllm import LLM, SamplingParams
3 # model_dir='Qwen2.5-14B-Instruct-GPTQ-Int4'
----> 5 llm = LLM(model=model_dir,enforce_eager=True)
6 sampling_params = SamplingParams( top_p=0.9, max_tokens=512,top_k=10)
8 prompt = "1+1等于几"
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\utils.py:1023, in deprecate_args.<locals>.wrapper.<locals>.inner(*args, **kwargs)
1016 msg += f" {additional_message}"
1018 warnings.warn(
1019 DeprecationWarning(msg),
1020 stacklevel=3, # The inner function takes up one level
1021 )
-> 1023 return fn(*args, **kwargs)
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\entrypoints\llm.py:198, in LLM.__init__(self, model, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, cpu_offload_gb, enforce_eager, max_context_len_to_capture, max_seq_len_to_capture, disable_custom_all_reduce, disable_async_output_proc, mm_processor_kwargs, task, **kwargs)
172 kwargs["disable_log_stats"] = True
174 engine_args = EngineArgs(
175 model=model,
176 task=task,
(...)
196 **kwargs,
197 )
--> 198 self.llm_engine = LLMEngine.from_engine_args(
199 engine_args, usage_context=UsageContext.LLM_CLASS)
200 self.request_counter = Counter()
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\engine\llm_engine.py:582, in LLMEngine.from_engine_args(cls, engine_args, usage_context, stat_loggers)
580 executor_class = cls._get_executor_cls(engine_config)
581 # Create the LLM engine.
--> 582 engine = cls(
583 **engine_config.to_dict(),
584 executor_class=executor_class,
585 log_stats=not engine_args.disable_log_stats,
586 usage_context=usage_context,
587 stat_loggers=stat_loggers,
588 )
590 return engine
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\engine\llm_engine.py:341, in LLMEngine.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, speculative_config, decoding_config, observability_config, prompt_adapter_config, executor_class, log_stats, usage_context, stat_loggers, input_registry, use_cached_outputs)
337 self.input_registry = input_registry
338 self.input_processor = input_registry.create_input_processor(
339 model_config)
--> 341 self.model_executor = executor_class(
342 model_config=model_config,
343 cache_config=cache_config,
344 parallel_config=parallel_config,
345 scheduler_config=scheduler_config,
346 device_config=device_config,
347 lora_config=lora_config,
348 speculative_config=speculative_config,
349 load_config=load_config,
350 prompt_adapter_config=prompt_adapter_config,
351 observability_config=self.observability_config,
352 )
354 if self.model_config.task != "embedding":
355 self._initialize_kv_caches()
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\executor\executor_base.py:47, in ExecutorBase.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, speculative_config, prompt_adapter_config, observability_config)
45 self.prompt_adapter_config = prompt_adapter_config
46 self.observability_config = observability_config
---> 47 self._init_executor()
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\executor\gpu_executor.py:38, in GPUExecutor._init_executor(self)
33 """Initialize the worker and load the model.
34 """
35 assert self.parallel_config.world_size == 1, (
36 "GPUExecutor only supports single GPU.")
---> 38 self.driver_worker = self._create_worker()
39 self.driver_worker.init_device()
40 self.driver_worker.load_model()
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\executor\gpu_executor.py:105, in GPUExecutor._create_worker(self, local_rank, rank, distributed_init_method)
101 def _create_worker(self,
102 local_rank: int = 0,
103 rank: int = 0,
104 distributed_init_method: Optional[str] = None):
--> 105 return create_worker(**self._get_create_worker_kwargs(
106 local_rank=local_rank,
107 rank=rank,
108 distributed_init_method=distributed_init_method))
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\executor\gpu_executor.py:24, in create_worker(worker_module_name, worker_class_name, worker_class_fn, **kwargs)
16 def create_worker(worker_module_name: str, worker_class_name: str,
17 worker_class_fn: Optional[Callable[[], Type[WorkerBase]]],
18 **kwargs):
19 wrapper = WorkerWrapperBase(
20 worker_module_name=worker_module_name,
21 worker_class_name=worker_class_name,
22 worker_class_fn=worker_class_fn,
23 )
---> 24 wrapper.init_worker(**kwargs)
25 return wrapper.worker
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\worker\worker_base.py:449, in WorkerWrapperBase.init_worker(self, *args, **kwargs)
446 mod = importlib.import_module(self.worker_module_name)
447 worker_class = getattr(mod, self.worker_class_name)
--> 449 self.worker = worker_class(*args, **kwargs)
450 assert self.worker is not None
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\worker\worker.py:99, in Worker.__init__(self, model_config, parallel_config, scheduler_config, device_config, cache_config, load_config, local_rank, rank, distributed_init_method, lora_config, speculative_config, prompt_adapter_config, is_driver_worker, model_runner_cls, observability_config)
97 elif self._is_encoder_decoder_model():
98 ModelRunnerClass = EncoderDecoderModelRunner
---> 99 self.model_runner: GPUModelRunnerBase = ModelRunnerClass(
100 model_config,
101 parallel_config,
102 scheduler_config,
103 device_config,
104 cache_config,
105 load_config=load_config,
106 lora_config=self.lora_config,
107 kv_cache_dtype=self.cache_config.cache_dtype,
108 is_driver_worker=is_driver_worker,
109 prompt_adapter_config=prompt_adapter_config,
110 observability_config=observability_config,
111 **speculative_args,
112 )
113 # Uninitialized cache engine. Will be initialized by
114 # initialize_cache.
115 self.cache_engine: List[CacheEngine]
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\worker\model_runner.py:1013, in GPUModelRunnerBase.__init__(self, model_config, parallel_config, scheduler_config, device_config, cache_config, load_config, lora_config, kv_cache_dtype, is_driver_worker, prompt_adapter_config, return_hidden_states, observability_config, input_registry, mm_registry)
1008 num_attn_heads = self.model_config.get_num_attention_heads(
1009 self.parallel_config)
1010 needs_attn_backend = (num_attn_heads != 0
1011 or self.model_config.is_attention_free)
-> 1013 self.attn_backend = get_attn_backend(
1014 self.model_config.get_head_size(),
1015 self.model_config.dtype,
1016 self.kv_cache_dtype,
1017 self.block_size,
1018 self.model_config.is_attention_free,
1019 ) if needs_attn_backend else None
1020 if self.attn_backend:
1021 self.attn_state = self.attn_backend.get_state_cls()(
1022 weakref.proxy(self))
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\attention\selector.py:120, in get_attn_backend(head_size, dtype, kv_cache_dtype, block_size, is_attention_free, is_blocksparse)
118 if backend == _Backend.XFORMERS:
119 logger.info("Using XFormers backend.")
--> 120 from vllm.attention.backends.xformers import ( # noqa: F401
121 XFormersBackend)
122 return XFormersBackend
123 elif backend == _Backend.ROCM_FLASH:
File d:\my\env\python3.10.10\lib\site-packages\vllm-0.6.3.post2.dev156+g04a3ae0a.d20241030-py3.10.egg\vllm\attention\backends\xformers.py:6
3 from typing import Any, Dict, List, Optional, Tuple, Type
5 import torch
----> 6 from xformers import ops as xops
7 from xformers.ops.fmha.attn_bias import (AttentionBias,
8 BlockDiagonalCausalMask,
9 BlockDiagonalMask,
10 LowerTriangularMaskWithTensorBias)
12 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
13 AttentionMetadata, AttentionType)
File d:\my\env\python3.10.10\lib\site-packages\xformers\ops\__init__.py:8
1 # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
2 #
3 # This source code is licensed under the BSD license found in the
4 # LICENSE file in the root directory of this source tree.
6 import torch
----> 8 from .fmha import (
9 AttentionBias,
10 AttentionOp,
11 AttentionOpBase,
12 LowerTriangularMask,
13 MemoryEfficientAttentionCkOp,
14 MemoryEfficientAttentionCutlassFwdFlashBwOp,
15 MemoryEfficientAttentionCutlassOp,
16 MemoryEfficientAttentionFlashAttentionOp,
17 MemoryEfficientAttentionSplitKCkOp,
18 memory_efficient_attention,
19 memory_efficient_attention_backward,
20 memory_efficient_attention_forward,
21 memory_efficient_attention_forward_requires_grad,
22 )
23 from .indexing import index_select_cat, scaled_index_add
24 from .ipc import init_ipc
File d:\my\env\python3.10.10\lib\site-packages\xformers\ops\fmha\__init__.py:10
6 from typing import Any, List, Optional, Sequence, Tuple, Type, Union, cast
8 import torch
---> 10 from . import (
11 attn_bias,
12 ck,
13 ck_decoder,
14 ck_splitk,
15 cutlass,
16 flash,
17 flash3,
18 triton_splitk,
19 )
20 from .attn_bias import VARLEN_BIASES, AttentionBias, LowerTriangularMask
21 from .common import (
22 AttentionBwOpBase,
23 AttentionFwOpBase,
(...)
29 bmk2bmhk,
30 )
File d:\my\env\python3.10.10\lib\site-packages\xformers\ops\fmha\triton_splitk.py:110
94 return (
95 super(InputsFp8, self).nbytes
96 + (
(...)
105 )
106 )
109 if TYPE_CHECKING or _is_triton_available():
--> 110 from ._triton.splitk_kernels import _fwd_kernel_splitK, _splitK_reduce
111 else:
112 _fwd_kernel_splitK = None
File d:\my\env\python3.10.10\lib\site-packages\xformers\ops\fmha\_triton\splitk_kernels.py:632
629 if sys.version_info >= (3, 9):
630 # unroll_varargs requires Python 3.9+
631 for num_groups in [1, 2, 4, 8]:
--> 632 _fwd_kernel_splitK_autotune[num_groups] = autotune_kernel(
633 _get_splitk_kernel(num_groups)
634 )
636 def get_autotuner_cache(
637 num_groups: int,
638 ) -> Dict[Tuple[Union[int, str]], triton.Config]:
639 """Returns a triton.runtime.autotuner.AutoTuner.cache object, which
640 represents mappings from kernel autotune keys (tuples describing kernel inputs)
641 to triton.Config
642 """
File d:\my\env\python3.10.10\lib\site-packages\xformers\ops\fmha\_triton\splitk_kernels.py:614, in autotune_kernel(kernel)
604 WARPS_VALUES = [1, 2, 4]
606 TRITON_CONFIGS = [
607 gen_config(block_m, block_n, stages, warps)
608 for block_m in BLOCK_M_VALUES
(...)
611 for warps in WARPS_VALUES
612 ]
--> 614 kernel = triton.autotune(
615 configs=TRITON_CONFIGS,
616 key=AUTOTUNER_KEY,
617 use_cuda_graph=True,
618 )(kernel)
619 return kernel
TypeError: autotune() got an unexpected keyword argument 'use_cuda_graph'
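A speculative workaround sketch, not a confirmed fix: if upgrading Triton on Windows is not an option, the unsupported keyword could be stripped before xformers is imported, so that the older autotune() only receives arguments it understands. The shim below is hypothetical and written for this report; it is not part of vLLM, xformers, or Triton.

import functools
import inspect
import triton

if "use_cuda_graph" not in inspect.signature(triton.autotune).parameters:
    _orig_autotune = triton.autotune

    @functools.wraps(_orig_autotune)
    def _patched_autotune(*args, **kwargs):
        # Older Triton releases do not know this flag; drop it before delegating.
        kwargs.pop("use_cuda_graph", None)
        return _orig_autotune(*args, **kwargs)

    triton.autotune = _patched_autotune

# Import vLLM (and therefore xformers) only after the shim is in place.
from vllm import LLM, SamplingParams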