diff --git a/requirements/common.txt b/requirements/common.txt
index 9a9ae1d93896..90d594b6f45b 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -44,4 +44,9 @@ watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/others/logging_configuration.md
 scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
+opentelemetry-sdk>=1.26.0 # vllm.tracing
+opentelemetry-api>=1.26.0 # vllm.tracing
+opentelemetry-exporter-otlp>=1.26.0 # vllm.tracing
+opentelemetry-semantic-conventions-ai>=0.4.1 # vllm.tracing
+opentelemetry-instrumentation-fastapi>=0.47b0
 pybase64 # fast base64 implementation
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 3d7d28055dd0..df8ad590903e 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -12,7 +12,8 @@
 
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, VllmConfig)
+                         ObservabilityConfig, ParallelConfig, SchedulerConfig,
+                         VllmConfig)
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_timeout import asyncio_timeout
@@ -1115,6 +1116,10 @@ async def get_model_config(self) -> ModelConfig:
         """Get the model configuration of the vLLM engine."""
         return self.engine.get_model_config()
 
+    async def get_observability_config(self) -> "ObservabilityConfig":
+        """Get the observability configuration of the vLLM engine."""
+        return self.engine.get_observability_config()
+
     async def get_parallel_config(self) -> ParallelConfig:
         """Get the parallel configuration of the vLLM engine."""
         return self.engine.get_parallel_config()
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 8fccf9bd2aa0..3b014e4762fe 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -832,6 +832,10 @@ def get_model_config(self) -> ModelConfig:
         """Gets the model configuration."""
         return self.model_config
 
+    def get_observability_config(self) -> ObservabilityConfig:
+        """Gets the observability configuration."""
+        return self.observability_config
+
     def get_parallel_config(self) -> ParallelConfig:
         """Gets the parallel configuration."""
         return self.parallel_config
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 9e018ec7f344..0d189827ea17 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -16,7 +16,8 @@
 from zmq.asyncio import Socket
 
 from vllm import PoolingParams
-from vllm.config import DecodingConfig, ModelConfig, VllmConfig
+from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
+                         VllmConfig)
 from vllm.core.scheduler import SchedulerOutputs
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -96,6 +97,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
         # Get the configs.
         self.vllm_config = engine_config
         self.model_config = engine_config.model_config
+        self.observability_config = engine_config.observability_config
         self.decoding_config = engine_config.decoding_config
 
         # Create the tokenizer group.
@@ -387,6 +389,9 @@ async def get_decoding_config(self) -> DecodingConfig:
 
     async def get_model_config(self) -> ModelConfig:
         return self.model_config
 
+    async def get_observability_config(self) -> ObservabilityConfig:
+        return self.observability_config
+
     async def is_tracing_enabled(self) -> bool:
         return self.tracing_flag
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 8688fcc82cd9..d07c99fafe32 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -6,7 +6,8 @@
 from typing import AsyncGenerator, Mapping, Optional
 
 from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
-from vllm.config import DecodingConfig, ModelConfig, VllmConfig
+from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
+                         VllmConfig)
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.inputs.data import PromptType, TokensPrompt
 from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
@@ -249,6 +250,11 @@ async def get_model_config(self) -> ModelConfig:
         """Get the model configuration of the vLLM engine."""
         ...
 
+    @abstractmethod
+    async def get_observability_config(self) -> "ObservabilityConfig":
+        """Get the observability configuration of the vLLM engine."""
+        ...
+
     @abstractmethod
     async def get_decoding_config(self) -> DecodingConfig:
         """Get the decoding configuration of the vLLM engine."""
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index a23736470f66..16b913f51e96 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -35,7 +35,7 @@
 from typing_extensions import assert_never
 
 import vllm.envs as envs
-from vllm.config import VllmConfig
+from vllm.config import ObservabilityConfig, VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
@@ -111,6 +111,25 @@
 _running_tasks: set[asyncio.Task] = set()
 
 
+def setup_otel(app: FastAPI, observability_config: ObservabilityConfig):
+    from opentelemetry import trace
+    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
+        OTLPSpanExporter)
+    from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
+    from opentelemetry.sdk.resources import Resource
+    from opentelemetry.sdk.trace import TracerProvider
+    from opentelemetry.sdk.trace.export import BatchSpanProcessor
+
+    trace.set_tracer_provider(TracerProvider(resource=Resource.create()))
+
+    otlp_exporter = OTLPSpanExporter(
+        endpoint=observability_config.otlp_traces_endpoint)
+    trace.get_tracer_provider().add_span_processor(
+        BatchSpanProcessor(otlp_exporter))
+
+    FastAPIInstrumentor().instrument_app(app)
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     try:
@@ -1014,7 +1033,8 @@ def load_log_config(log_config_file: Optional[str]) -> Optional[dict]:
         return None
 
 
-def build_app(args: Namespace) -> FastAPI:
+def build_app(args: Namespace,
+              observability_config: ObservabilityConfig) -> FastAPI:
     if args.disable_fastapi_docs:
         app = FastAPI(openapi_url=None,
                       docs_url=None,
@@ -1025,6 +1045,9 @@
     app.include_router(router)
     app.root_path = args.root_path
 
+    if observability_config.otlp_traces_endpoint is not None:
+        setup_otel(app, observability_config)
+
     mount_metrics(app)
 
     app.add_middleware(
@@ -1343,7 +1366,8 @@ async def run_server_worker(listen_address,
         uvicorn_kwargs['log_config'] = log_config
 
     async with build_async_engine_client(args,
                                         client_config) as engine_client:
-        app = build_app(args)
+        observability_config = await engine_client.get_observability_config()
+        app = build_app(args, observability_config)
 
         vllm_config = await engine_client.get_vllm_config()
         await init_app_state(engine_client, vllm_config, app.state, args)
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 3754570dfaaa..382c4ac3fb33 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -8,7 +8,7 @@
 import numpy as np
 
 import vllm.envs as envs
-from vllm.config import ModelConfig, VllmConfig
+from vllm.config import ModelConfig, ObservabilityConfig, VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
@@ -93,6 +93,7 @@ def __init__(
         self.vllm_config = vllm_config
         self.log_requests = log_requests
         self.log_stats = log_stats
+        self.observability_config = vllm_config.observability_config
 
         # Set up stat loggers; independent set for each DP rank.
         self.stat_loggers: list[list[StatLoggerBase]] = setup_default_loggers(
@@ -526,6 +527,9 @@ async def get_vllm_config(self) -> VllmConfig:
 
     async def get_model_config(self) -> ModelConfig:
         return self.model_config
 
+    async def get_observability_config(self) -> ObservabilityConfig:
+        return self.observability_config
+
     async def get_decoding_config(self):
         raise ValueError("Not Supported on V1 yet.")
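
Note (not part of the diff): the snippet below is a minimal standalone sketch of the same wiring that setup_otel() performs, with a ConsoleSpanExporter swapped in for the OTLP gRPC exporter so spans can be inspected locally without running a collector. The module name, the /healthz route, and the exporter choice are illustrative assumptions only.

# sketch_otel_fastapi.py -- illustrative sketch; mirrors the setup_otel() wiring
# above but prints spans to stdout instead of exporting them over OTLP gRPC.
from fastapi import FastAPI
from opentelemetry import trace
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (BatchSpanProcessor,
                                            ConsoleSpanExporter)

app = FastAPI()


@app.get("/healthz")
async def healthz():
    return {"status": "ok"}


# Install a global tracer provider and attach a span processor, as setup_otel()
# does; ConsoleSpanExporter stands in for OTLPSpanExporter here.
provider = TracerProvider(resource=Resource.create())
provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)

# Wrap the app so each incoming HTTP request is recorded as a server span.
FastAPIInstrumentor().instrument_app(app)

Running this with `uvicorn sketch_otel_fastapi:app` and hitting /healthz prints one span per request. In the server itself, the instrumentation is only activated when ObservabilityConfig.otlp_traces_endpoint is set, per the guard in build_app() above, so deployments opt in to the FastAPI HTTP spans alongside the existing vllm.tracing request spans.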