5 changes: 3 additions & 2 deletions examples/aquila/conf/config_inference.yaml
@@ -4,16 +4,17 @@ defaults:

 experiment:
   exp_name: aquila2
-  exp_dir: ./outputs
+  exp_dir: ./outputs/${experiment.exp_name}
   task:
     type: inference
     backend: vllm
     entrypoint: flagscale/inference/inference_aquila.py
   runner:
     hostfile: null
   cmds:
-    before_start: source activate flagscale
+    before_start: source /root/miniconda3/bin/activate flagscale
   envs:
+    VLLM_LOGGING_LEVEL: "INFO"
     CUDA_VISIBLE_DEVICES: 0
     CUDA_DEVICE_MAX_CONNECTIONS: 1

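The new `exp_dir` value relies on OmegaConf-style interpolation, which Hydra (the loader FlagScale uses for these configs) resolves when the key is read. A minimal sketch of how `${experiment.exp_name}` expands, using only the two keys involved:

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    """
    experiment:
      exp_name: aquila2
      exp_dir: ./outputs/${experiment.exp_name}
    """
)
# Interpolations are resolved lazily, at access time:
print(cfg.experiment.exp_dir)  # -> ./outputs/aquila2

Deriving `exp_dir` from `exp_name` this way keeps each experiment's outputs in its own subdirectory without repeating the name.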
172 changes: 172 additions & 0 deletions flagscale/backends/vllm/pyproject.toml
@@ -0,0 +1,172 @@
[build-system]
# Should be mirrored in requirements/build.txt
requires = [
    "cmake>=3.26",
    "ninja",
    "packaging",
    "setuptools>=61",
    "setuptools-scm>=8.0",
    "torch == 2.6.0",
    "wheel",
    "jinja2",
]
build-backend = "setuptools.build_meta"

[project]
name = "vllm"
authors = [{name = "vLLM Team"}]
license = {text = "Apache-2.0"}
readme = "README.md"
description = "A high-throughput and memory-efficient inference and serving engine for LLMs"
classifiers = [
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Intended Audience :: Developers",
    "Intended Audience :: Information Technology",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Scientific/Engineering :: Information Analysis",
]
requires-python = ">=3.9,<3.13"
dynamic = [ "version", "dependencies", "optional-dependencies"]

[project.urls]
Homepage="https://github.com/vllm-project/vllm"
Documentation="https://vllm.readthedocs.io/en/latest/"
Slack="http://slack.vllm.ai/"

[project.scripts]
vllm = "vllm.entrypoints.cli.main:main"

[tool.setuptools_scm]
# no extra settings needed, presence enables setuptools-scm

[tool.setuptools.packages.find]
where = ["."]
include = ["vllm*"]

[tool.yapfignore]
ignore_patterns = [
    "build/**",
]

[tool.ruff]
# Allow lines to be as long as 80.
line-length = 80
exclude = [
    # External file, leaving license intact
    "examples/other/fp8/quantizer/quantize.py",
    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
]

[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
# Python 3.8 typing. TODO: Remove these excludes after v1.0.0
"vllm/adapter_commons/**/*.py" = ["UP006", "UP035"]
"vllm/attention/**/*.py" = ["UP006", "UP035"]
"vllm/compilation/**/*.py" = ["UP006", "UP035"]
"vllm/core/**/*.py" = ["UP006", "UP035"]
"vllm/device_allocator/**/*.py" = ["UP006", "UP035"]
"vllm/distributed/**/*.py" = ["UP006", "UP035"]
"vllm/engine/**/*.py" = ["UP006", "UP035"]
"vllm/executor/**/*.py" = ["UP006", "UP035"]
"vllm/lora/**/*.py" = ["UP006", "UP035"]
"vllm/model_executor/**/*.py" = ["UP006", "UP035"]
"vllm/platforms/**/*.py" = ["UP006", "UP035"]
"vllm/plugins/**/*.py" = ["UP006", "UP035"]
"vllm/profiler/**/*.py" = ["UP006", "UP035"]
"vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"]
"vllm/spec_decode/**/*.py" = ["UP006", "UP035"]
"vllm/transformers_utils/**/*.py" = ["UP006", "UP035"]
"vllm/triton_utils/**/*.py" = ["UP006", "UP035"]
"vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
"vllm/utils.py" = ["UP006", "UP035"]

[tool.ruff.lint]
select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    # "I",
    "G",
]
ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
]

[tool.mypy]
ignore_missing_imports = true
check_untyped_defs = true
follow_imports = "silent"

# After fixing type errors resulting from follow_imports: "skip" -> "silent",
# move the directory here and remove it from tools/mypy.sh
files = [
    "vllm/*.py",
    "vllm/adapter_commons",
    "vllm/assets",
    "vllm/entrypoints",
    "vllm/core",
    "vllm/inputs",
    "vllm/logging_utils",
    "vllm/multimodal",
    "vllm/platforms",
    "vllm/transformers_utils",
    "vllm/triton_utils",
    "vllm/usage",
]
# TODO(woosuk): Include the code from Megatron and HuggingFace.
exclude = [
    "vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
    # Ignore triton kernels in ops.
    'vllm/attention/ops/.*\.py$'
]

[tool.codespell]
ignore-words-list = "dout, te, indicies, subtile, ElementE"
skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"

[tool.isort]
use_parentheses = true
skip_gitignore = true

[tool.pytest.ini_options]
markers = [
    "skip_global_cleanup",
    "core_model: enable this model test in each PR instead of only nightly",
    "cpu_model: enable this model test in CPU tests",
    "quant_model: run this model test under Quantized category",
    "split: run this test as part of a split",
    "distributed: run this test only in distributed GPU tests",
    "skip_v1: do not run this test with v1",
    "optional: optional tests that are automatically skipped, include --optional to run them",
]

[tool.pymarkdown]
plugins.md004.style = "sublist" # ul-style
plugins.md013.enabled = false # line-length
plugins.md041.enabled = false # first-line-h1
plugins.md033.enabled = false # inline-html
plugins.md024.allow_different_nesting = true # no-duplicate-headers
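Most of the per-file-ignores in the ruff section above silence pyupgrade's UP006/UP035 rules, which flag pre-PEP-585 typing usage. A hedged illustration of what those two rules would report (the function is hypothetical, for demonstration only):

from typing import Dict, List  # UP035: deprecated import; prefer builtins/collections.abc


def token_counts(texts: List[str]) -> Dict[str, int]:  # UP006: use list[str] / dict[str, int]
    counts: Dict[str, int] = {}
    for text in texts:
        for token in text.split():
            counts[token] = counts.get(token, 0) + 1
    return counts

Keeping the ignores in place lets the vendored code stay byte-compatible with upstream vLLM, which still supports Python 3.9.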
3 changes: 1 addition & 2 deletions flagscale/backends/vllm/vllm/core/scheduler.py
@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0

+import contextlib # --- FLAGSCALE MODIFICATION ---
 import enum
 import os
 import random
 import time
-import contextlib # --- FLAGSCALE MODIFICATION ---
 from collections import deque
 from dataclasses import dataclass, field
 from typing import Callable, Deque, Dict, Iterable, List, Optional
@@ -1720,7 +1720,6 @@ def schedule(
                 multi_modal_placeholders=(
                     seq_group.multi_modal_placeholders
                     if scheduler_outputs.num_prefill_groups > 0 else None),
-                mm_processor_kwargs=seq_group.mm_processor_kwargs,
                 prompt_adapter_request=seq_group.prompt_adapter_request,
             )
         else:
flagscale/backends/vllm/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -1,6 +1,3 @@
-# Copied from https://github.com/vllm-project/vllm/blob/1ad957950ffc1552af5abda78c03d88ddb67945b/vllm/distributed/device_communicators/pynccl_wrapper.py.
-# Below is the original copyright:
-
 # SPDX-License-Identifier: Apache-2.0

 # This file is a pure Python wrapper for the NCCL library.
@@ -274,6 +271,7 @@ def ncclGetUniqueId(self) -> ncclUniqueId:
             ctypes.byref(unique_id)))
         return unique_id

+    # --- FLAGSCALE MODIFICATION BEG ---
     def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId:
         """
         Reconstructs an `ncclUniqueId` object from bytes data.
@@ -294,6 +292,7 @@ def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId:
         unique_id = ncclUniqueId()
         ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128)
         return unique_id
+    # --- FLAGSCALE MODIFICATION END ---

     def ncclCommInitRank(self, world_size: int, unique_id: ncclUniqueId,
                          rank: int) -> ncclComm_t:
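The new `unique_id_from_bytes` helper makes the 128-byte NCCL unique id transportable over any byte channel, presumably so the P2P KV-transfer path added in this PR can hand the id between processes. A hedged usage sketch, assuming a machine with the NCCL shared library available:

from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary

nccl = NCCLLibrary()                      # loads libnccl via ctypes
uid = nccl.ncclGetUniqueId()              # the sending rank creates a fresh id
raw = bytes(uid.internal)                 # 128 opaque bytes, safe to serialize
rebuilt = nccl.unique_id_from_bytes(raw)  # the receiving rank reconstructs it
assert bytes(rebuilt.internal) == raw     # the round trip preserves the id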
flagscale/backends/vllm/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -1,18 +1,24 @@
-# Copied from https://github.com/vllm-project/vllm/blob/1ad957950ffc1552af5abda78c03d88ddb67945b/vllm/distributed/kv_transfer/kv_connector/factory.py.
-# Below is the original copyright:
 # SPDX-License-Identifier: Apache-2.0

 import importlib
 from typing import TYPE_CHECKING, Callable, Dict, Type

+import vllm.envs as envs
+from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType
+from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1,
+                                                          KVConnectorRole)
+from vllm.logger import init_logger
+
 from .base import KVConnectorBase

 if TYPE_CHECKING:
     from vllm.config import VllmConfig

+logger = init_logger(__name__)
+

 class KVConnectorFactory:
-    _registry: Dict[str, Callable[[], Type[KVConnectorBase]]] = {}
+    _registry: Dict[str, Callable[[], Type[KVConnectorBaseType]]] = {}

     @classmethod
     def register_connector(cls, name: str, module_path: str,
@@ -21,30 +27,60 @@ def register_connector(cls, name: str, module_path: str,
         if name in cls._registry:
             raise ValueError(f"Connector '{name}' is already registered.")

-        def loader() -> Type[KVConnectorBase]:
+        def loader() -> Type[KVConnectorBaseType]:
             module = importlib.import_module(module_path)
             return getattr(module, class_name)

         cls._registry[name] = loader

     @classmethod
-    def create_connector(cls, rank: int, local_rank: int,
-                         config: "VllmConfig") -> KVConnectorBase:
+    def create_connector_v0(cls, rank: int, local_rank: int,
+                            config: "VllmConfig") -> KVConnectorBase:
+        if envs.VLLM_USE_V1:
+            raise ValueError("Attempting to initialize a V0 Connector, "
+                             f"but found {envs.VLLM_USE_V1=}")
+
         connector_name = config.kv_transfer_config.kv_connector
         if connector_name not in cls._registry:
             raise ValueError(f"Unsupported connector type: {connector_name}")

         connector_cls = cls._registry[connector_name]()
+        assert issubclass(connector_cls, KVConnectorBase)
         return connector_cls(rank, local_rank, config)

+    @classmethod
+    def create_connector_v1(
+        cls,
+        config: "VllmConfig",
+        role: KVConnectorRole,
+    ) -> KVConnectorBase_V1:
+        if not envs.VLLM_USE_V1:
+            raise ValueError("Attempting to initialize a V1 Connector, "
+                             f"but found {envs.VLLM_USE_V1=}")
+
+        connector_name = config.kv_transfer_config.kv_connector
+        connector_cls = cls._registry[connector_name]()
+        assert issubclass(connector_cls, KVConnectorBase_V1)
+        logger.info("Creating v1 connector with name: %s", connector_name)
+        # NOTE(Kuntai): v1 connector is explicitly separated into two roles.
+        # Scheduler connector:
+        # - Co-locate with scheduler process
+        # - Should only be used inside the Scheduler class
+        # Worker connector:
+        # - Co-locate with worker process
+        # - Should only be used inside the forward context & attention layer
+        # We build separately to enforce strict separation
+        return connector_cls(config, role)


 # Register various connectors here.
 # The registration should not be done in each individual file, as we want to
 # only load the files corresponding to the current connector.
+# --- FLAGSCALE MODIFICATION BEG ---
 KVConnectorFactory.register_connector(
     "P2pConnector", "vllm.distributed.kv_transfer.kv_connector.p2p_connector",
     "P2pConnector")

+# --- FLAGSCALE MODIFICATION END ---
 KVConnectorFactory.register_connector(
     "PyNcclConnector",
     "vllm.distributed.kv_transfer.kv_connector.simple_connector",
@@ -63,4 +99,14 @@ def create_connector(cls, rank: int, local_rank: int,
 KVConnectorFactory.register_connector(
     "MooncakeStoreConnector",
     "vllm.distributed.kv_transfer.kv_connector.mooncake_store_connector",
-    "MooncakeStoreConnector")
+    "MooncakeStoreConnector")
+
+KVConnectorFactory.register_connector(
+    "SharedStorageConnector",
+    "vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector",
+    "SharedStorageConnector")
+
+KVConnectorFactory.register_connector(
+    "LMCacheConnectorV1",
+    "vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector",
+    "LMCacheConnectorV1")