[Metrics] Add --show-hidden-metrics-for-version CLI arg (#13295)

This commit is contained in:
Mark McLoughlin 2025-02-22 08:20:45 +00:00 committed by GitHub
parent 1cd981da4f
commit 2cb8c1540e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 95 additions and 1 deletions

View File

@ -36,3 +36,11 @@ The following metrics are exposed:
:language: python
:start-after: begin-metrics-definitions
:::
The following metrics are deprecated and due to be removed in a future version:
- *(No metrics are currently deprecated)*
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
and are then removed in version `X.Y+2`.

36
tests/test_version.py Normal file
View File

@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import patch
import pytest
from vllm import version
def test_version_is_defined():
    """The version module must expose a ``__version__`` that is not None."""
    reported = version.__version__
    assert reported is not None
def test_version_tuple():
    """``__version_tuple__`` must have 3, 4 or 5 components."""
    component_count = len(version.__version_tuple__)
    assert component_count in (3, 4, 5)
@pytest.mark.parametrize(
    "version_tuple, version_str, expected",
    [
        # A (0, 0, ...) version tuple is treated as a dev tree: any string
        # matches, even one that is not a version at all.
        ((0, 0, "dev"), "0.0", True),
        ((0, 0, "dev"), "foobar", True),
        # Only the immediately preceding minor version matches.
        ((0, 7, 4), "0.6", True),
        ((0, 7, 4), "0.5", False),
        ((0, 7, 4), "0.7", False),
        ((1, 2, 3), "1.1", True),
        ((1, 2, 3), "1.0", False),
        ((1, 2, 3), "1.2", False),
        # Known limitation: for an X.0 release the helper compares against
        # "X.-1", so no real 0.x version string can match.
        ((1, 0, 0), "1.-1", True),
        ((1, 0, 0), "0.9", False),
        ((1, 0, 0), "0.17", False),
    ],
)
def test_prev_minor_version_was(version_tuple, version_str, expected):
    """Check _prev_minor_version_was() against a patched version tuple."""
    # Swap in the version tuple under test for the duration of the call.
    with patch.object(version, "__version_tuple__", version_tuple):
        matched = version._prev_minor_version_was(version_str)
    assert matched == expected

View File

@ -2653,7 +2653,9 @@ class DecodingConfig:
@dataclass
class ObservabilityConfig:
"""Configuration for observability."""
"""Configuration for observability - metrics and tracing."""
show_hidden_metrics: bool = False
otlp_traces_endpoint: Optional[str] = None
# Collecting detailed timing information for each request can be expensive.

View File

@ -10,6 +10,7 @@ from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Mapping, Optional,
import torch
import vllm.envs as envs
from vllm import version
from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
DecodingConfig, DeviceConfig, HfOverrides,
KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
@ -188,6 +189,7 @@ class EngineArgs:
qlora_adapter_name_or_path: Optional[str] = None
disable_logprobs_during_spec_decoding: Optional[bool] = None
show_hidden_metrics_for_version: Optional[str] = None
otlp_traces_endpoint: Optional[str] = None
collect_detailed_traces: Optional[str] = None
disable_async_output_proc: bool = False
@ -909,6 +911,18 @@ class EngineArgs:
default=None,
help='Name or path of the QLoRA adapter.')
parser.add_argument('--show-hidden-metrics-for-version',
type=str,
default=None,
help='Enable deprecated Prometheus metrics that '
'have been hidden since the specified version. '
'For example, if a previously deprecated metric '
'has been hidden since the v0.7.0 release, you '
'use --show-hidden-metrics-for-version=0.7 as a '
'temporary escape hatch while you migrate to new '
'metrics. The metric is likely to be removed '
'completely in an upcoming release.')
parser.add_argument(
'--otlp-traces-endpoint',
type=str,
@ -1317,6 +1331,11 @@ class EngineArgs:
decoding_config = DecodingConfig(
guided_decoding_backend=self.guided_decoding_backend)
show_hidden_metrics = False
if self.show_hidden_metrics_for_version is not None:
show_hidden_metrics = version._prev_minor_version_was(
self.show_hidden_metrics_for_version)
detailed_trace_modules = []
if self.collect_detailed_traces is not None:
detailed_trace_modules = self.collect_detailed_traces.split(",")
@ -1326,6 +1345,7 @@ class EngineArgs:
f"Invalid module {m} in collect_detailed_traces. "
f"Valid modules are {ALLOWED_DETAILED_TRACE_MODULES}")
observability_config = ObservabilityConfig(
show_hidden_metrics=show_hidden_metrics,
otlp_traces_endpoint=self.otlp_traces_endpoint,
collect_model_forward_time="model" in detailed_trace_modules
or "all" in detailed_trace_modules,

View File

@ -516,6 +516,11 @@ class PrometheusStatLogger(StatLoggerBase):
self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
vllm_config=vllm_config)
# Use this flag to hide metrics that were deprecated in
# a previous release and which will be removed in a future release
self.show_hidden_metrics = \
vllm_config.observability_config.show_hidden_metrics
def _log_gauge(self, gauge, data: Union[int, float]) -> None:
# Convenience function for logging to gauge.
gauge.labels(**self.labels).set(data)

View File

@ -95,6 +95,11 @@ class PrometheusStatLogger(StatLoggerBase):
def __init__(self, vllm_config: VllmConfig):
self._unregister_vllm_metrics()
# Use this flag to hide metrics that were deprecated in
# a previous release and which will be removed in a future release
self.show_hidden_metrics = \
vllm_config.observability_config.show_hidden_metrics
labelnames = ["model_name"]
labelvalues = [vllm_config.model_config.served_model_name]

View File

@ -11,3 +11,21 @@ except Exception as e:
__version__ = "dev"
__version_tuple__ = (0, 0, __version__)
def _prev_minor_version_was(version_str):
    """Report whether version_str names the minor version before this one.

    Returns True when version_str equals "<major>.<minor - 1>" of the
    current version. For example, if the current version is 0.7.4, only
    '0.6' matches. When the version tuple starts with (0, 0) - a dev
    tree - every version_str matches.

    Used for --show-hidden-metrics-for-version.
    """
    is_dev_tree = __version_tuple__[0:2] == (0, 0)
    if not is_dev_tree:
        # Note - this won't do the right thing when we release 1.0: the
        # expected string for an X.0 version comes out as "X.-1".
        expected = f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"
        return version_str == expected

    # Dev trees match anything.
    return True