Merged
Changes from all commits
26 commits
3765f9f
draft
Isotr0py Oct 7, 2025
41cd6e9
Merge branch 'vllm-project:main' into torch-utils
Isotr0py Oct 15, 2025
7dffb1e
update mop context
Isotr0py Oct 15, 2025
20876be
update
Isotr0py Oct 15, 2025
ea5ad7a
Merge remote-tracking branch 'upstream/main' into torch-utils
Isotr0py Oct 15, 2025
a4ac619
move STR_DTYPE_TO_TORCH_DTYPE and kv_caches utils
Isotr0py Oct 15, 2025
47601ee
move current_stream
Isotr0py Oct 16, 2025
7cacad0
move cuda_device_count_stateless
Isotr0py Oct 16, 2025
eaa7a95
Merge remote-tracking branch 'upstream/main' into torch-utils
Isotr0py Oct 16, 2025
b97b512
weak_ref_tensors and get_cuda_view_from_cpu_tensor
Isotr0py Oct 16, 2025
8648ac2
move torch version helper and ops registration
Isotr0py Oct 16, 2025
b0853f8
rename torch_utils.py to torch.py
Isotr0py Oct 16, 2025
32923fd
fix import
Isotr0py Oct 16, 2025
9a78766
rename back to torch_utils to avoid conflicts
Isotr0py Oct 16, 2025
703d667
Merge branch 'main' into torch-utils
Isotr0py Oct 16, 2025
e819af6
move make_tensor_with_pad
Isotr0py Oct 16, 2025
041075b
Merge remote-tracking branch 'upstream/main' into torch-utils
Isotr0py Oct 17, 2025
1e13e3a
fix
Isotr0py Oct 17, 2025
c1f5b17
Merge branch 'main' into torch-utils
Isotr0py Oct 17, 2025
aab5f12
Merge branch 'main' into torch-utils
Isotr0py Oct 17, 2025
e67b2f1
Merge branch 'main' into torch-utils
Isotr0py Oct 17, 2025
6655659
fix
Isotr0py Oct 18, 2025
515c98c
Merge remote-tracking branch 'upstream/main' into torch-utils
Isotr0py Oct 18, 2025
218714e
Merge remote-tracking branch 'upstream/main' into torch-utils
Isotr0py Oct 18, 2025
81c925c
Merge remote-tracking branch 'upstream/main' into torch-utils
Isotr0py Oct 18, 2025
2c28a3a
move kv_cache_dtype_str_to_dtype
Isotr0py Oct 18, 2025
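A note on the rename commits above (b0853f8, 32923fd, 9a78766): the helper module briefly became vllm/utils/torch.py before being renamed back to torch_utils.py "to avoid conflicts". The commit messages do not spell out the conflict; the sketch below is a hypothetical illustration, assuming the problem was that a submodule sharing PyTorch's name is easy to confuse with the real package.

# Hypothetical layout, not part of the merged PR:
#
#   vllm/utils/__init__.py
#   vllm/utils/torch.py        <- helper submodule named like PyTorch

import torch                   # absolute import: always the real PyTorch

from vllm.utils import torch as torch_helpers
# Depending on what vllm/utils/__init__.py imports and in which order, the
# attribute vllm.utils.torch can refer either to this helper submodule or to
# a re-exported PyTorch module, so readers and tooling can no longer tell the
# two apart. Keeping the module named torch_utils sidesteps the ambiguity.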
3 changes: 2 additions & 1 deletion benchmarks/kernels/bench_per_token_quant_fp8.py
@@ -10,7 +10,8 @@
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


def with_triton_mode(fn):
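The hunk above is the pattern the rest of the diff repeats: torch-specific helpers are now imported from vllm.utils.torch_utils, while general-purpose utilities such as FlexibleArgumentParser keep their vllm.utils import path. A minimal before/after sketch, using the names from this file:

# Before this PR: one combined import from vllm.utils
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser

# After this PR: the torch-specific mapping moves to vllm.utils.torch_utils
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE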
3 changes: 2 additions & 1 deletion benchmarks/kernels/benchmark_activation.py
@@ -10,7 +10,8 @@
from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
3 changes: 2 additions & 1 deletion benchmarks/kernels/benchmark_layernorm.py
@@ -7,7 +7,8 @@

from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


@torch.inference_mode()
4 changes: 2 additions & 2 deletions benchmarks/kernels/benchmark_paged_attention.py
@@ -9,9 +9,9 @@
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)

3 changes: 2 additions & 1 deletion benchmarks/kernels/benchmark_quant.py
@@ -7,7 +7,8 @@

from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


@torch.inference_mode()
4 changes: 2 additions & 2 deletions benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -9,9 +9,9 @@
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)

4 changes: 2 additions & 2 deletions benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -12,9 +12,9 @@
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random_flash,
)

2 changes: 1 addition & 1 deletion tests/compile/piecewise/test_full_cudagraph.py
@@ -11,7 +11,7 @@
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig
from vllm.platforms import current_platform
from vllm.utils import is_torch_equal_or_newer
from vllm.utils.torch_utils import is_torch_equal_or_newer


@contextlib.contextmanager
2 changes: 1 addition & 1 deletion tests/compile/piecewise/test_multiple_graphs.py
@@ -20,7 +20,7 @@
set_current_vllm_config,
)
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import is_torch_equal_or_newer
from vllm.utils.torch_utils import is_torch_equal_or_newer

# This import automatically registers `torch.ops.silly.attention`
from .. import silly_attention # noqa: F401
2 changes: 1 addition & 1 deletion tests/compile/piecewise/test_simple.py
@@ -19,7 +19,7 @@
set_current_vllm_config,
)
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import is_torch_equal_or_newer
from vllm.utils.torch_utils import is_torch_equal_or_newer

# This import automatically registers `torch.ops.silly.attention`
from ..silly_attention import get_global_counter, reset_global_counter
2 changes: 1 addition & 1 deletion tests/compile/piecewise/test_toy_llama.py
@@ -27,7 +27,7 @@
set_current_vllm_config,
)
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import is_torch_equal_or_newer
from vllm.utils.torch_utils import is_torch_equal_or_newer

# This import automatically registers `torch.ops.silly.attention`
from .. import silly_attention # noqa: F401
2 changes: 1 addition & 1 deletion tests/compile/silly_attention.py
@@ -8,7 +8,7 @@
import torch
from torch.library import Library

from vllm.utils import direct_register_custom_op
from vllm.utils.torch_utils import direct_register_custom_op

# Shared library for all compilation test operations
# Using "silly" namespace to match existing test expectations
2 changes: 1 addition & 1 deletion tests/compile/test_aot_compile.py
@@ -15,7 +15,7 @@
set_current_vllm_config,
)
from vllm.forward_context import set_forward_context
from vllm.utils import is_torch_equal_or_newer
from vllm.utils.torch_utils import is_torch_equal_or_newer


def reference_fn(x: torch.Tensor):
2 changes: 1 addition & 1 deletion tests/compile/test_basic_correctness.py
@@ -5,7 +5,7 @@
import pytest

from vllm.config import CompilationMode
from vllm.utils import cuda_device_count_stateless
from vllm.utils.torch_utils import cuda_device_count_stateless

from ..utils import compare_all_settings
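This test, tests/distributed/test_utils.py, tests/kernels/moe/test_modular_kernel_combinations.py, and tests/utils.py all switch to importing cuda_device_count_stateless from vllm.utils.torch_utils. As a hedged aside that the diff itself does not state: the helper exists because torch.cuda.device_count() caches its result once CUDA is initialized, so tests that change CUDA_VISIBLE_DEVICES afterwards need a variant that re-reads the environment. A usage sketch:

import os

from vllm.utils.torch_utils import cuda_device_count_stateless

# Unlike torch.cuda.device_count(), the stateless variant is meant to
# reflect changes to CUDA_VISIBLE_DEVICES made after CUDA initialization.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(cuda_device_count_stateless())    # expected: 1

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
print(cuda_device_count_stateless())    # expected: 2 on a multi-GPU host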

2 changes: 1 addition & 1 deletion tests/compile/test_config.py
@@ -8,7 +8,7 @@
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
from vllm.config.compilation import CompilationMode
from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
from vllm.utils.torch_utils import _is_torch_equal_or_newer, is_torch_equal_or_newer


def test_version():
2 changes: 1 addition & 1 deletion tests/compile/test_decorator.py
@@ -15,7 +15,7 @@
set_current_vllm_config,
)
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import is_torch_equal_or_newer
from vllm.utils.torch_utils import is_torch_equal_or_newer

# This import automatically registers `torch.ops.silly.attention`
from . import silly_attention # noqa: F401
2 changes: 1 addition & 1 deletion tests/compile/test_full_graph.py
@@ -12,7 +12,7 @@
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.platforms import current_platform
from vllm.utils import is_torch_equal_or_newer
from vllm.utils.torch_utils import is_torch_equal_or_newer

from ..utils import create_new_process_for_each_test

2 changes: 1 addition & 1 deletion tests/compile/test_fusions_e2e.py
@@ -15,8 +15,8 @@
from vllm import LLM, SamplingParams
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.platforms import current_platform
from vllm.utils import is_torch_equal_or_newer
from vllm.utils.flashinfer import has_flashinfer
from vllm.utils.torch_utils import is_torch_equal_or_newer

from ..utils import flat_product, multi_gpu_test

2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -60,8 +60,8 @@
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils import set_default_torch_num_threads
from vllm.utils.collections import is_list_of
from vllm.utils.torch_utils import set_default_torch_num_threads

logger = init_logger(__name__)

2 changes: 1 addition & 1 deletion tests/distributed/test_sequence_parallel.py
@@ -18,7 +18,7 @@
from vllm.config.compilation import CompilationMode
from vllm.config.model import RunnerOption
from vllm.logger import init_logger
from vllm.utils import is_torch_equal_or_newer
from vllm.utils.torch_utils import is_torch_equal_or_newer

from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import compare_two_settings, create_new_process_for_each_test
2 changes: 1 addition & 1 deletion tests/distributed/test_utils.py
@@ -11,10 +11,10 @@
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils import (
cuda_device_count_stateless,
get_open_port,
update_environment_variables,
)
from vllm.utils.torch_utils import cuda_device_count_stateless

from ..utils import multi_gpu_test

5 changes: 4 additions & 1 deletion tests/kernels/attention/conftest.py
@@ -3,7 +3,10 @@

import pytest

from vllm.utils import create_kv_caches_with_random, create_kv_caches_with_random_flash
from vllm.utils.torch_utils import (
create_kv_caches_with_random,
create_kv_caches_with_random_flash,
)


@pytest.fixture()
2 changes: 1 addition & 1 deletion tests/kernels/attention/test_prefix_prefill.py
@@ -15,7 +15,7 @@
from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
from vllm.attention.ops.prefix_prefill import context_attention_fwd
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

NUM_HEADS = [64]
NUM_QUERIES_PER_KV = [1, 64]
3 changes: 2 additions & 1 deletion tests/kernels/core/test_uva.py
@@ -3,7 +3,8 @@
import pytest
import torch

from vllm.utils import get_cuda_view_from_cpu_tensor, is_uva_available
from vllm.utils import is_uva_available
from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor

CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]

3 changes: 2 additions & 1 deletion tests/kernels/moe/test_modular_kernel_combinations.py
@@ -13,8 +13,9 @@
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.platforms import current_platform
from vllm.utils import cuda_device_count_stateless, has_deep_ep, has_deep_gemm, has_pplx
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.torch_utils import cuda_device_count_stateless

from .modular_kernel_tools.common import (
Config,
2 changes: 1 addition & 1 deletion tests/kernels/utils.py
@@ -22,8 +22,8 @@
STR_BACKEND_ENV_VAR,
STR_FLASH_ATTN_VAL,
STR_XFORMERS_ATTN_VAL,
make_tensor_with_pad,
)
from vllm.utils.torch_utils import make_tensor_with_pad

# For now, disable "test_aot_dispatch_dynamic" since there are some
# bugs related to this test in PyTorch 2.4.
2 changes: 1 addition & 1 deletion tests/models/multimodal/pooling/test_intern_vit.py
@@ -7,7 +7,7 @@
from transformers import AutoConfig, AutoModel, CLIPImageProcessor

from vllm.distributed import cleanup_dist_env_and_memory
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

from ....conftest import ImageTestAssets

2 changes: 1 addition & 1 deletion tests/models/multimodal/pooling/test_radio.py
@@ -9,7 +9,7 @@
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.models.radio import RadioModel
from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

from ....conftest import ImageTestAssets

2 changes: 1 addition & 1 deletion tests/models/multimodal/processing/test_tensor_schema.py
@@ -26,7 +26,6 @@
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.model_executor.models.interfaces import (
SupportsMultiModal,
supports_multimodal,
@@ -36,6 +35,7 @@
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.utils.collections import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype

from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
from ...utils import dummy_hf_overrides
2 changes: 1 addition & 1 deletion tests/utils.py
@@ -46,10 +46,10 @@
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils import (
FlexibleArgumentParser,
cuda_device_count_stateless,
get_open_port,
)
from vllm.utils.mem_constants import GB_bytes
from vllm.utils.torch_utils import cuda_device_count_stateless

if current_platform.is_rocm():
from amdsmi import (
10 changes: 6 additions & 4 deletions tests/utils_/test_utils.py
@@ -24,11 +24,8 @@
from vllm.utils import (
FlexibleArgumentParser,
bind_kv_cache,
common_broadcastable_dtype,
current_stream,
get_open_port,
get_tcp_uri,
is_lossless_cast,
join_host_port,
make_zmq_path,
make_zmq_socket,
@@ -37,6 +34,11 @@
split_zmq_path,
unique_filepath,
)
from vllm.utils.torch_utils import (
common_broadcastable_dtype,
current_stream,
is_lossless_cast,
)

from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
from ..utils import create_new_process_for_each_test, flat_product
@@ -408,7 +410,7 @@ def test_bind_kv_cache_pp():


def test_bind_kv_cache_pp():
with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
with patch("vllm.utils.torch_utils.cuda_device_count_stateless", lambda: 2):
# this test runs with 1 GPU, but we simulate 2 GPUs
cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
with set_current_vllm_config(cfg):
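The patch target in test_bind_kv_cache_pp moves together with the helper because unittest.mock.patch replaces the name in the module where the callee looks it up at call time. A minimal sketch of the principle, assuming bind_kv_cache resolves cuda_device_count_stateless through vllm.utils.torch_utils after this PR:

from unittest.mock import patch

# patch() swaps the attribute on the module object named in the string, so
# the string must point at the namespace the code under test actually uses.
# Patching the old "vllm.utils.cuda_device_count_stateless" path would now
# leave the real function in place.
with patch("vllm.utils.torch_utils.cuda_device_count_stateless", lambda: 2):
    ...  # the code under test sees two simulated GPUs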
3 changes: 2 additions & 1 deletion tests/v1/attention/test_attention_backends.py
@@ -18,7 +18,8 @@
from vllm.attention.backends.registry import _Backend
from vllm.config import ModelConfig
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv, is_torch_equal_or_newer
from vllm.utils import cdiv
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, is_torch_equal_or_newer
from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
set_kv_cache_layout,
3 changes: 2 additions & 1 deletion tests/v1/attention/test_mla_backends.py
@@ -22,7 +22,8 @@
from vllm.attention.backends.registry import _Backend
from vllm.attention.ops.flashmla import is_flashmla_dense_supported
from vllm.config.vllm import set_current_vllm_config
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
from vllm.utils import cdiv
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec

2 changes: 1 addition & 1 deletion tests/v1/engine/test_async_llm.py
@@ -15,7 +15,7 @@
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.sampling_params import RequestOutputKind
from vllm.utils import set_default_torch_num_threads
from vllm.utils.torch_utils import set_default_torch_num_threads
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.metrics.loggers import (
AggregatedLoggingStatLogger,
2 changes: 1 addition & 1 deletion tests/v1/engine/test_engine_core.py
@@ -12,7 +12,7 @@
from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.utils import set_default_torch_num_threads
from vllm.utils.torch_utils import set_default_torch_num_threads
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
from vllm.v1.executor.abstract import Executor, UniProcExecutor
2 changes: 1 addition & 1 deletion tests/v1/engine/test_engine_core_client.py
@@ -21,7 +21,7 @@
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.usage.usage_lib import UsageContext
from vllm.utils import set_default_torch_num_threads
from vllm.utils.torch_utils import set_default_torch_num_threads
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
from vllm.v1.engine.core_client import AsyncMPClient, EngineCoreClient, SyncMPClient
3 changes: 2 additions & 1 deletion tests/v1/sample/test_sampler.py
@@ -7,7 +7,8 @@

from tests.v1.sample.utils import create_allowed_token_ids
from vllm.platforms import current_platform
from vllm.utils import is_pin_memory_available, make_tensor_with_pad
from vllm.utils import is_pin_memory_available
from vllm.utils.torch_utils import make_tensor_with_pad
from vllm.v1.sample.logits_processor import LogitsProcessors
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.sampler import Sampler