From 5edeea21fa841c739635dbe98f48fb9df7df54fb Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 7 Aug 2024 17:16:04 +0300 Subject: [PATCH 1/4] Make OpenTelemetry availability errors clearer --- vllm/config.py | 10 ++++++---- vllm/tracing.py | 21 +++++++++++++-------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index a5a9984a0114a..bb8242ccade1c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform -from vllm.tracing import is_otel_installed +from vllm.tracing import is_otel_available, otel_import_err from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes, cuda_device_count_stateless, get_cpu_memory, is_cpu, @@ -1721,9 +1721,11 @@ class ObservabilityConfig: collect_model_execute_time: bool = False def __post_init__(self): - if not is_otel_installed() and self.otlp_traces_endpoint is not None: - raise ValueError("OpenTelemetry packages must be installed before " - "configuring 'otlp_traces_endpoint'") + if not is_otel_available() and self.otlp_traces_endpoint is not None: + raise ValueError( + "OpenTelemetry is not available. Unable to configure " + "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are " + "installed.") from otel_import_err if ((self.collect_model_forward_time or self.collect_model_execute_time) diff --git a/vllm/tracing.py b/vllm/tracing.py index 8bd71b8fd9ea5..df146b2fe952e 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -8,7 +8,8 @@ logger = init_logger(__name__) -_is_otel_installed = False +_is_otel_imported = False +otel_import_err = None try: from opentelemetry.context.context import Context from opentelemetry.sdk.environment_variables import ( @@ -19,8 +20,9 @@ from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider from opentelemetry.trace.propagation.tracecontext import ( TraceContextTextMapPropagator) - _is_otel_installed = True -except ImportError: + _is_otel_imported = True +except ImportError as e: + otel_import_err = e class Context: # type: ignore pass @@ -35,14 +37,17 @@ class Tracer: # type: ignore pass -def is_otel_installed() -> bool: - return _is_otel_installed +def is_otel_available() -> bool: + return _is_otel_imported def init_tracer(instrumenting_module_name: str, otlp_traces_endpoint: str) -> Optional[Tracer]: - assert is_otel_installed(), ("OpenTelemetry packages must be installed " - "prior to initializing a tracer") + if not is_otel_available(): + raise ValueError( + "OpenTelemetry is not available. Unable to initialize " + "a tracer. Ensure OpenTelemetry packages are installed." + ) from otel_import_err trace_provider = TracerProvider() span_exporter = get_span_exporter(otlp_traces_endpoint) @@ -70,7 +75,7 @@ def get_span_exporter(endpoint): def extract_trace_context( headers: Optional[Mapping[str, str]]) -> Optional[Context]: - if is_otel_installed(): + if is_otel_available(): headers = headers or {} return TraceContextTextMapPropagator().extract(headers) else: From 6b7396d049a6a3e251a1a5a9cff75a4e106eb2b6 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 7 Aug 2024 17:27:12 +0300 Subject: [PATCH 2/4] Pin OpenTelemetry package versions --- .buildkite/test-pipeline.yaml | 8 ++++---- examples/production_monitoring/Otel.md | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7babffc62f431..2a5b56b4bb364 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -111,10 +111,10 @@ steps: commands: - pytest -v -s metrics - "pip install \ - opentelemetry-sdk \ - opentelemetry-api \ - opentelemetry-exporter-otlp \ - opentelemetry-semantic-conventions-ai" + opentelemetry-sdk>=1.26.0,<1.27.0 \ + opentelemetry-api>=1.26.0,<1.27.0 \ + opentelemetry-exporter-otlp>=1.26.0,<1.27.0 \ + opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0" - pytest -v -s tracing ##### fast check tests ##### diff --git a/examples/production_monitoring/Otel.md b/examples/production_monitoring/Otel.md index 2c7a7caa1bd7c..96d1f96bfa144 100644 --- a/examples/production_monitoring/Otel.md +++ b/examples/production_monitoring/Otel.md @@ -3,10 +3,10 @@ 1. Install OpenTelemetry packages: ``` pip install \ - opentelemetry-sdk \ - opentelemetry-api \ - opentelemetry-exporter-otlp \ - opentelemetry-semantic-conventions-ai + 'opentelemetry-sdk>=1.26.0,<1.27.0' \ + 'opentelemetry-api>=1.26.0,<1.27.0' \ + 'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0' ``` 1. Start Jaeger in a docker container: From 66f549933f6833b1ff9fd1f8f80883cf36f993a9 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Mon, 19 Aug 2024 15:49:40 +0300 Subject: [PATCH 3/4] Fix YAML syntax for OpenTelemetry pip install command --- .buildkite/test-pipeline.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2a5b56b4bb364..df7d45a4e9b8e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -111,10 +111,10 @@ steps: commands: - pytest -v -s metrics - "pip install \ - opentelemetry-sdk>=1.26.0,<1.27.0 \ - opentelemetry-api>=1.26.0,<1.27.0 \ - opentelemetry-exporter-otlp>=1.26.0,<1.27.0 \ - opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0" + 'opentelemetry-sdk>=1.26.0,<1.27.0' \ + 'opentelemetry-api>=1.26.0,<1.27.0' \ + 'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'" - pytest -v -s tracing ##### fast check tests ##### From 3e46b4845095bf09d23074236d7a726a8fafd608 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Tue, 20 Aug 2024 15:38:08 +0300 Subject: [PATCH 4/4] Store string representation of the error to avoid memory leaks --- vllm/config.py | 4 ++-- vllm/tracing.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index bb8242ccade1c..0d5d098bc8858 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform -from vllm.tracing import is_otel_available, otel_import_err +from vllm.tracing import is_otel_available, otel_import_error_traceback from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes, cuda_device_count_stateless, get_cpu_memory, is_cpu, @@ -1725,7 +1725,7 @@ def __post_init__(self): raise ValueError( "OpenTelemetry is not available. Unable to configure " "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are " - "installed.") from otel_import_err + f"installed. Original error:\n{otel_import_error_traceback}") if ((self.collect_model_forward_time or self.collect_model_execute_time) diff --git a/vllm/tracing.py b/vllm/tracing.py index df146b2fe952e..31849e2b635aa 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -9,7 +9,7 @@ logger = init_logger(__name__) _is_otel_imported = False -otel_import_err = None +otel_import_error_traceback: Optional[str] = None try: from opentelemetry.context.context import Context from opentelemetry.sdk.environment_variables import ( @@ -21,8 +21,13 @@ from opentelemetry.trace.propagation.tracecontext import ( TraceContextTextMapPropagator) _is_otel_imported = True -except ImportError as e: - otel_import_err = e +except ImportError: + # Capture and format traceback to provide detailed context for the import + # error. Only the string representation of the error is retained to avoid + # memory leaks. + # See https://github.com/vllm-project/vllm/pull/7266#discussion_r1707395458 + import traceback + otel_import_error_traceback = traceback.format_exc() class Context: # type: ignore pass @@ -46,8 +51,8 @@ def init_tracer(instrumenting_module_name: str, if not is_otel_available(): raise ValueError( "OpenTelemetry is not available. Unable to initialize " - "a tracer. Ensure OpenTelemetry packages are installed." - ) from otel_import_err + "a tracer. Ensure OpenTelemetry packages are installed. " + f"Original error:\n{otel_import_error_traceback}") trace_provider = TracerProvider() span_exporter = get_span_exporter(otlp_traces_endpoint)