[Metrics] Hide deprecated metrics (#15458)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
commit 98d7367b61
parent 594a8b9030
@@ -13,9 +13,12 @@ import requests
 from prometheus_client.parser import text_string_to_metric_families
 from transformers import AutoTokenizer
 
+from vllm import version
+
 from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+PREV_MINOR_VERSION = version._prev_minor_version()
 
 
 @pytest.fixture(scope="module", params=[True, False])
@@ -55,6 +58,7 @@ def default_server_args():
                     "",
                     "--enable-chunked-prefill",
                     "--disable-frontend-multiprocessing",
+                    f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
                 ])
 def server(use_v1, default_server_args, request):
     if request.param:
@@ -129,7 +133,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 
     # Loop over all expected metric_families
     for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if use_v1 and metric_family not in EXPECTED_METRICS_V1:
+        if ((use_v1 and metric_family not in EXPECTED_METRICS_V1)
+                or (not server.show_hidden_metrics
+                    and metric_family in HIDDEN_DEPRECATED_METRICS)):
             continue
 
         found_metric = False
@@ -165,10 +171,10 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 
 EXPECTED_METRICS = [
     "vllm:num_requests_running",
-    "vllm:num_requests_swapped",
+    "vllm:num_requests_swapped",  # deprecated
     "vllm:num_requests_waiting",
     "vllm:gpu_cache_usage_perc",
-    "vllm:cpu_cache_usage_perc",
+    "vllm:cpu_cache_usage_perc",  # deprecated
     "vllm:time_to_first_token_seconds_sum",
     "vllm:time_to_first_token_seconds_bucket",
     "vllm:time_to_first_token_seconds_count",
@@ -268,6 +274,11 @@ EXPECTED_METRICS_V1 = [
     "vllm:request_decode_time_seconds_count",
 ]
 
+HIDDEN_DEPRECATED_METRICS = [
+    "vllm:num_requests_swapped",
+    "vllm:cpu_cache_usage_perc",
+]
+
 
 @pytest.mark.asyncio
 async def test_metrics_exist(server: RemoteOpenAIServer,
@@ -282,7 +293,9 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
     assert response.status_code == HTTPStatus.OK
 
     for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
-        assert metric in response.text
+        if (not server.show_hidden_metrics
+                and metric not in HIDDEN_DEPRECATED_METRICS):
+            assert metric in response.text
 
 
 def test_metrics_exist_run_batch(use_v1: bool):
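For context (not part of the diff): the tests above only need to know whether a metric family shows up in the /metrics payload. Below is a minimal standalone sketch of that kind of check, using the same prometheus_client parser the test file imports; the sample payload and names are illustrative only, not taken from a real server response.

from prometheus_client.parser import text_string_to_metric_families

# A tiny hand-written /metrics payload standing in for the server response.
payload = """\
# HELP vllm:num_requests_running Number of requests currently running.
# TYPE vllm:num_requests_running gauge
vllm:num_requests_running{model_name="m"} 1.0
"""

# Collect the metric family names the scrape exposes.
families = {family.name for family in text_string_to_metric_families(payload)}

assert "vllm:num_requests_running" in families
# A deprecated metric that is hidden by default simply never appears:
assert "vllm:num_requests_swapped" not in families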
@@ -104,6 +104,9 @@ class RemoteOpenAIServer:
         self.host = str(args.host or 'localhost')
         self.port = int(args.port)
 
+        self.show_hidden_metrics = \
+            args.show_hidden_metrics_for_version is not None
+
         # download the model before starting the server to avoid timeout
         is_local = os.path.isdir(model)
         if not is_local:
@@ -52,6 +52,11 @@ class Metrics:
 
         max_model_len = vllm_config.model_config.max_model_len
 
+        # Use this flag to hide metrics that were deprecated in
+        # a previous release and which will be removed future
+        self.show_hidden_metrics = \
+            vllm_config.observability_config.show_hidden_metrics
+
         # System stats
         # Scheduler State
         self.gauge_scheduler_running = self._gauge_cls(
@@ -76,14 +81,15 @@ class Metrics:
         )
 
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_scheduler_swapped = self._gauge_cls(
-            name="vllm:num_requests_swapped",
-            documentation=(
-                "Number of requests swapped to CPU. "
-                "DEPRECATED: KV cache offloading is not used in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.gauge_scheduler_swapped = self._gauge_cls(
+                name="vllm:num_requests_swapped",
+                documentation=(
+                    "Number of requests swapped to CPU. "
+                    "DEPRECATED: KV cache offloading is not used in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
 
         # KV Cache Usage in %
         self.gauge_gpu_cache_usage = self._gauge_cls(
@@ -93,34 +99,33 @@ class Metrics:
             multiprocess_mode="sum")
 
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_cpu_cache_usage = self._gauge_cls(
-            name="vllm:cpu_cache_usage_perc",
-            documentation=(
-                "CPU KV-cache usage. 1 means 100 percent usage. "
-                "DEPRECATED: KV cache offloading is not used in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
-
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
-            name="vllm:cpu_prefix_cache_hit_rate",
-            documentation=(
-                "CPU prefix cache block hit rate. "
-                "DEPRECATED: KV cache offloading is not used in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.gauge_cpu_cache_usage = self._gauge_cls(
+                name="vllm:cpu_cache_usage_perc",
+                documentation=(
+                    "CPU KV-cache usage. 1 means 100 percent usage. "
+                    "DEPRECATED: KV cache offloading is not used in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
+            self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
+                name="vllm:cpu_prefix_cache_hit_rate",
+                documentation=(
+                    "CPU prefix cache block hit rate. "
+                    "DEPRECATED: KV cache offloading is not used in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
 
-        # Deprecated in 0.8 - replaced by queries+hits counters in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
-            name="vllm:gpu_prefix_cache_hit_rate",
-            documentation=("GPU prefix cache block hit rate. "
-                           "DEPRECATED: use vllm:gpu_prefix_cache_queries and "
-                           "vllm:gpu_prefix_cache_queries in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
+                name="vllm:gpu_prefix_cache_hit_rate",
+                documentation=("GPU prefix cache block hit rate. "
+                               "DEPRECATED: use vllm:gpu_prefix_cache_queries "
+                               "and vllm:gpu_prefix_cache_queries in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
 
         # Iteration stats
         self.counter_num_preemption = self._counter_cls(
@@ -198,33 +203,35 @@ class Metrics:
             labelnames=labelnames,
             buckets=request_latency_buckets)
-        # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.histogram_time_in_queue_request = self._histogram_cls(
-            name="vllm:time_in_queue_requests",
-            documentation=(
-                "Histogram of time the request spent in the queue in seconds. "
-                "DEPRECATED: use vllm:request_queue_time_seconds instead."),
-            labelnames=labelnames,
-            buckets=request_latency_buckets)
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.histogram_time_in_queue_request = self._histogram_cls(
+                name="vllm:time_in_queue_requests",
+                documentation=
+                ("Histogram of time the request spent in the queue in seconds. "
+                 "DEPRECATED: use vllm:request_queue_time_seconds instead."),
+                labelnames=labelnames,
+                buckets=request_latency_buckets)
 
-        # Deprecated in 0.8 - use prefill/decode/inference time metrics
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.histogram_model_forward_time_request = self._histogram_cls(
-            name="vllm:model_forward_time_milliseconds",
-            documentation=(
-                "Histogram of time spent in the model forward pass in ms. "
-                "DEPRECATED: use prefill/decode/inference time metrics instead."
-            ),
-            labelnames=labelnames,
-            buckets=build_1_2_3_5_8_buckets(3000))
-        self.histogram_model_execute_time_request = self._histogram_cls(
-            name="vllm:model_execute_time_milliseconds",
-            documentation=(
-                "Histogram of time spent in the model execute function in ms."
-                "DEPRECATED: use prefill/decode/inference time metrics instead."
-            ),
-            labelnames=labelnames,
-            buckets=build_1_2_3_5_8_buckets(3000))
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.histogram_model_forward_time_request = self._histogram_cls(
+                name="vllm:model_forward_time_milliseconds",
+                documentation=
+                ("Histogram of time spent in the model forward pass in ms. "
+                 "DEPRECATED: use prefill/decode/inference time metrics instead"
+                 ),
+                labelnames=labelnames,
+                buckets=build_1_2_3_5_8_buckets(3000))
+            self.histogram_model_execute_time_request = self._histogram_cls(
+                name="vllm:model_execute_time_milliseconds",
+                documentation=
+                ("Histogram of time spent in the model execute function in ms."
+                 "DEPRECATED: use prefill/decode/inference time metrics instead"
+                 ),
+                labelnames=labelnames,
+                buckets=build_1_2_3_5_8_buckets(3000))
 
         # Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
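For context (not part of the diff): the registration changes above all follow the same hide-then-remove pattern — a metric deprecated in 0.8 is still defined, but only registered when the escape hatch is on. Below is a minimal standalone sketch of that pattern with prometheus_client used directly instead of vLLM's _gauge_cls/_histogram_cls wrappers; the flag variable and metric names are illustrative, not vLLM's.

from prometheus_client import CollectorRegistry, Gauge, generate_latest

# Stand-in for vllm_config.observability_config.show_hidden_metrics.
show_hidden_metrics = False

registry = CollectorRegistry()

# Current metric: always registered.
running = Gauge("num_requests_running",
                "Number of requests currently running.",
                registry=registry)

# Deprecated metric: registered only when the escape hatch is enabled, so it
# disappears from /metrics by default but can be re-enabled for one release
# while dashboards migrate to the replacement.
swapped = None
if show_hidden_metrics:
    swapped = Gauge("num_requests_swapped",
                    "DEPRECATED: KV cache offloading is not used in V1.",
                    registry=registry)

running.set(3)
if swapped is not None:  # callers must guard their writes the same way
    swapped.set(0)

print(generate_latest(registry).decode())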
@@ -543,11 +550,6 @@ class PrometheusStatLogger(StatLoggerBase):
         self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
                                          vllm_config=vllm_config)
 
-        # Use this flag to hide metrics that were deprecated in
-        # a previous release and which will be removed future
-        self.show_hidden_metrics = \
-            vllm_config.observability_config.show_hidden_metrics
-
     def _log_gauge(self, gauge, data: Union[int, float]) -> None:
         # Convenience function for logging to gauge.
         gauge.labels(**self.labels).set(data)
@@ -580,18 +582,20 @@ class PrometheusStatLogger(StatLoggerBase):
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
                         stats.num_running_sys)
-        self._log_gauge(self.metrics.gauge_scheduler_swapped,
-                        stats.num_swapped_sys)
+        if self.metrics.show_hidden_metrics:
+            self._log_gauge(self.metrics.gauge_scheduler_swapped,
+                            stats.num_swapped_sys)
         self._log_gauge(self.metrics.gauge_scheduler_waiting,
                         stats.num_waiting_sys)
         self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                         stats.gpu_cache_usage_sys)
-        self._log_gauge(self.metrics.gauge_cpu_cache_usage,
-                        stats.cpu_cache_usage_sys)
-        self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
-                        stats.cpu_prefix_cache_hit_rate)
-        self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
-                        stats.gpu_prefix_cache_hit_rate)
+        if self.metrics.show_hidden_metrics:
+            self._log_gauge(self.metrics.gauge_cpu_cache_usage,
+                            stats.cpu_cache_usage_sys)
+            self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
+                            stats.cpu_prefix_cache_hit_rate)
+            self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
+                            stats.gpu_prefix_cache_hit_rate)
         # Including max-lora in metric, in future this property of lora
         # config maybe extended to be dynamic.
         lora_info = {
@@ -629,12 +633,15 @@ class PrometheusStatLogger(StatLoggerBase):
                             stats.time_prefill_requests)
         self._log_histogram(self.metrics.histogram_decode_time_request,
                             stats.time_decode_requests)
-        self._log_histogram(self.metrics.histogram_time_in_queue_request,
-                            stats.time_in_queue_requests)
-        self._log_histogram(self.metrics.histogram_model_forward_time_request,
-                            stats.model_forward_time_requests)
-        self._log_histogram(self.metrics.histogram_model_execute_time_request,
-                            stats.model_execute_time_requests)
+        if self.metrics.show_hidden_metrics:
+            self._log_histogram(self.metrics.histogram_time_in_queue_request,
+                                stats.time_in_queue_requests)
+            self._log_histogram(
+                self.metrics.histogram_model_forward_time_request,
+                stats.model_forward_time_requests)
+            self._log_histogram(
+                self.metrics.histogram_model_execute_time_request,
+                stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
@@ -28,4 +28,13 @@ def _prev_minor_version_was(version_str):
         return True
 
     # Note - this won't do the right thing when we release 1.0!
     assert __version_tuple__[0] == 0
+    assert isinstance(__version_tuple__[1], int)
     return version_str == f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"
+
+
+def _prev_minor_version():
+    """For the purpose of testing, return a previous minor version number."""
+    # In dev tree, this will return "0.-1", but that will work fine"
+    assert isinstance(__version_tuple__[1], int)
+    return f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"
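For context (not part of the diff): a worked example of what the new helper returns, under the assumption that the current release were 0.8.x; the tuple value below is hypothetical, the real one comes from vLLM's generated version module.

# Hypothetical release version tuple.
__version_tuple__ = (0, 8, 1)

# _prev_minor_version() formats "<major>.<minor - 1>" ...
prev = f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"
assert prev == "0.7"

# ... which the test fixture passes back to the server, e.g.
#   --show-hidden-metrics-for-version=0.7
# and _prev_minor_version_was("0.7") then accepts it.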