From 98d7367b614bf4bb2189e849723df24156de67d0 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin
Date: Wed, 2 Apr 2025 15:37:19 +0100
Subject: [PATCH] [Metrics] Hide deprecated metrics (#15458)

Signed-off-by: Mark McLoughlin
---
 tests/entrypoints/openai/test_metrics.py |  21 ++-
 tests/utils.py                           |   3 +
 vllm/engine/metrics.py                   | 163 ++++++++++++-----------
 vllm/version.py                          |   9 ++
 4 files changed, 114 insertions(+), 82 deletions(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 2bffd0ce..42f7b098 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -13,9 +13,12 @@ import requests
 from prometheus_client.parser import text_string_to_metric_families
 from transformers import AutoTokenizer
 
+from vllm import version
+
 from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+PREV_MINOR_VERSION = version._prev_minor_version()
 
 
 @pytest.fixture(scope="module", params=[True, False])
@@ -55,6 +58,7 @@ def default_server_args():
         "",
         "--enable-chunked-prefill",
         "--disable-frontend-multiprocessing",
+        f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
     ])
 def server(use_v1, default_server_args, request):
     if request.param:
@@ -129,7 +133,9 @@
 
     # Loop over all expected metric_families
     for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if use_v1 and metric_family not in EXPECTED_METRICS_V1:
+        if ((use_v1 and metric_family not in EXPECTED_METRICS_V1)
+                or (not server.show_hidden_metrics
+                    and metric_family in HIDDEN_DEPRECATED_METRICS)):
             continue
 
         found_metric = False
@@ -165,10 +171,10 @@
 
 EXPECTED_METRICS = [
     "vllm:num_requests_running",
-    "vllm:num_requests_swapped",
+    "vllm:num_requests_swapped",  # deprecated
     "vllm:num_requests_waiting",
     "vllm:gpu_cache_usage_perc",
-    "vllm:cpu_cache_usage_perc",
+    "vllm:cpu_cache_usage_perc",  # deprecated
     "vllm:time_to_first_token_seconds_sum",
     "vllm:time_to_first_token_seconds_bucket",
    "vllm:time_to_first_token_seconds_count",
@@ -268,6 +274,11 @@ EXPECTED_METRICS_V1 = [
     "vllm:request_decode_time_seconds_count",
 ]
 
+HIDDEN_DEPRECATED_METRICS = [
+    "vllm:num_requests_swapped",
+    "vllm:cpu_cache_usage_perc",
+]
+
 
 @pytest.mark.asyncio
 async def test_metrics_exist(server: RemoteOpenAIServer,
@@ -282,7 +293,9 @@
     assert response.status_code == HTTPStatus.OK
 
     for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
-        assert metric in response.text
+        if (server.show_hidden_metrics
+                or metric not in HIDDEN_DEPRECATED_METRICS):
+            assert metric in response.text
 
 
 def test_metrics_exist_run_batch(use_v1: bool):
diff --git a/tests/utils.py b/tests/utils.py
index 69c96d3f..8f8c102b 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -104,6 +104,9 @@ class RemoteOpenAIServer:
         self.host = str(args.host or 'localhost')
         self.port = int(args.port)
 
+        self.show_hidden_metrics = \
+            args.show_hidden_metrics_for_version is not None
+
         # download the model before starting the server to avoid timeout
         is_local = os.path.isdir(model)
         if not is_local:
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 70f36d12..5890f654 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -52,6 +52,11 @@ class Metrics:
 
         max_model_len = vllm_config.model_config.max_model_len
 
+        # Use this flag to hide metrics that were deprecated in
+        # a previous release and which will be removed in a future release
+        self.show_hidden_metrics = \
+            vllm_config.observability_config.show_hidden_metrics
+
         # System stats
         #   Scheduler State
         self.gauge_scheduler_running = self._gauge_cls(
@@ -76,14 +81,15 @@
         )
 
         # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_scheduler_swapped = self._gauge_cls(
-            name="vllm:num_requests_swapped",
-            documentation=(
-                "Number of requests swapped to CPU. "
-                "DEPRECATED: KV cache offloading is not used in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.gauge_scheduler_swapped = self._gauge_cls(
+                name="vllm:num_requests_swapped",
+                documentation=(
+                    "Number of requests swapped to CPU. "
+                    "DEPRECATED: KV cache offloading is not used in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
 
         # KV Cache Usage in %
         self.gauge_gpu_cache_usage = self._gauge_cls(
@@ -93,34 +99,33 @@
             multiprocess_mode="sum")
 
         # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_cpu_cache_usage = self._gauge_cls(
-            name="vllm:cpu_cache_usage_perc",
-            documentation=(
-                "CPU KV-cache usage. 1 means 100 percent usage. "
-                "DEPRECATED: KV cache offloading is not used in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
-
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
-            name="vllm:cpu_prefix_cache_hit_rate",
-            documentation=(
-                "CPU prefix cache block hit rate. "
-                "DEPRECATED: KV cache offloading is not used in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.gauge_cpu_cache_usage = self._gauge_cls(
+                name="vllm:cpu_cache_usage_perc",
+                documentation=(
+                    "CPU KV-cache usage. 1 means 100 percent usage. "
+                    "DEPRECATED: KV cache offloading is not used in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
+            self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
+                name="vllm:cpu_prefix_cache_hit_rate",
+                documentation=(
+                    "CPU prefix cache block hit rate. "
+                    "DEPRECATED: KV cache offloading is not used in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
 
         # Deprecated in 0.8 - replaced by queries+hits counters in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
-            name="vllm:gpu_prefix_cache_hit_rate",
-            documentation=("GPU prefix cache block hit rate. "
-                           "DEPRECATED: use vllm:gpu_prefix_cache_queries and "
-                           "vllm:gpu_prefix_cache_queries in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
+                name="vllm:gpu_prefix_cache_hit_rate",
+                documentation=("GPU prefix cache block hit rate. "
" + "DEPRECATED: use vllm:gpu_prefix_cache_queries " + "and vllm:gpu_prefix_cache_queries in V1"), + labelnames=labelnames, + multiprocess_mode="sum") # Iteration stats self.counter_num_preemption = self._counter_cls( @@ -198,33 +203,35 @@ class Metrics: labelnames=labelnames, buckets=request_latency_buckets) # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds: - # TODO: in 0.9, only enable if show_hidden_metrics=True - self.histogram_time_in_queue_request = self._histogram_cls( - name="vllm:time_in_queue_requests", - documentation=( - "Histogram of time the request spent in the queue in seconds. " - "DEPRECATED: use vllm:request_queue_time_seconds instead."), - labelnames=labelnames, - buckets=request_latency_buckets) + # Hidden in 0.9, due to be removed in 0.10 + if self.show_hidden_metrics: + self.histogram_time_in_queue_request = self._histogram_cls( + name="vllm:time_in_queue_requests", + documentation= + ("Histogram of time the request spent in the queue in seconds. " + "DEPRECATED: use vllm:request_queue_time_seconds instead."), + labelnames=labelnames, + buckets=request_latency_buckets) # Deprecated in 0.8 - use prefill/decode/inference time metrics - # TODO: in 0.9, only enable if show_hidden_metrics=True - self.histogram_model_forward_time_request = self._histogram_cls( - name="vllm:model_forward_time_milliseconds", - documentation=( - "Histogram of time spent in the model forward pass in ms. " - "DEPRECATED: use prefill/decode/inference time metrics instead." - ), - labelnames=labelnames, - buckets=build_1_2_3_5_8_buckets(3000)) - self.histogram_model_execute_time_request = self._histogram_cls( - name="vllm:model_execute_time_milliseconds", - documentation=( - "Histogram of time spent in the model execute function in ms." - "DEPRECATED: use prefill/decode/inference time metrics instead." - ), - labelnames=labelnames, - buckets=build_1_2_3_5_8_buckets(3000)) + # Hidden in 0.9, due to be removed in 0.10 + if self.show_hidden_metrics: + self.histogram_model_forward_time_request = self._histogram_cls( + name="vllm:model_forward_time_milliseconds", + documentation= + ("Histogram of time spent in the model forward pass in ms. " + "DEPRECATED: use prefill/decode/inference time metrics instead" + ), + labelnames=labelnames, + buckets=build_1_2_3_5_8_buckets(3000)) + self.histogram_model_execute_time_request = self._histogram_cls( + name="vllm:model_execute_time_milliseconds", + documentation= + ("Histogram of time spent in the model execute function in ms." + "DEPRECATED: use prefill/decode/inference time metrics instead" + ), + labelnames=labelnames, + buckets=build_1_2_3_5_8_buckets(3000)) # Metadata self.histogram_num_prompt_tokens_request = self._histogram_cls( @@ -543,11 +550,6 @@ class PrometheusStatLogger(StatLoggerBase): self.metrics = self._metrics_cls(labelnames=list(labels.keys()), vllm_config=vllm_config) - # Use this flag to hide metrics that were deprecated in - # a previous release and which will be removed future - self.show_hidden_metrics = \ - vllm_config.observability_config.show_hidden_metrics - def _log_gauge(self, gauge, data: Union[int, float]) -> None: # Convenience function for logging to gauge. 
         gauge.labels(**self.labels).set(data)
@@ -580,18 +582,20 @@
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
                         stats.num_running_sys)
-        self._log_gauge(self.metrics.gauge_scheduler_swapped,
-                        stats.num_swapped_sys)
+        if self.metrics.show_hidden_metrics:
+            self._log_gauge(self.metrics.gauge_scheduler_swapped,
+                            stats.num_swapped_sys)
         self._log_gauge(self.metrics.gauge_scheduler_waiting,
                         stats.num_waiting_sys)
         self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                         stats.gpu_cache_usage_sys)
-        self._log_gauge(self.metrics.gauge_cpu_cache_usage,
-                        stats.cpu_cache_usage_sys)
-        self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
-                        stats.cpu_prefix_cache_hit_rate)
-        self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
-                        stats.gpu_prefix_cache_hit_rate)
+        if self.metrics.show_hidden_metrics:
+            self._log_gauge(self.metrics.gauge_cpu_cache_usage,
+                            stats.cpu_cache_usage_sys)
+            self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
+                            stats.cpu_prefix_cache_hit_rate)
+            self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
+                            stats.gpu_prefix_cache_hit_rate)
         # Including max-lora in metric, in future this property of lora
         # config maybe extended to be dynamic.
         lora_info = {
@@ -629,12 +633,15 @@
                             stats.time_prefill_requests)
         self._log_histogram(self.metrics.histogram_decode_time_request,
                             stats.time_decode_requests)
-        self._log_histogram(self.metrics.histogram_time_in_queue_request,
-                            stats.time_in_queue_requests)
-        self._log_histogram(self.metrics.histogram_model_forward_time_request,
-                            stats.model_forward_time_requests)
-        self._log_histogram(self.metrics.histogram_model_execute_time_request,
-                            stats.model_execute_time_requests)
+        if self.metrics.show_hidden_metrics:
+            self._log_histogram(self.metrics.histogram_time_in_queue_request,
+                                stats.time_in_queue_requests)
+            self._log_histogram(
+                self.metrics.histogram_model_forward_time_request,
+                stats.model_forward_time_requests)
+            self._log_histogram(
+                self.metrics.histogram_model_execute_time_request,
+                stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
diff --git a/vllm/version.py b/vllm/version.py
index ab5909b1..8329d7be 100644
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -28,4 +28,13 @@ def _prev_minor_version_was(version_str):
         return True
 
     # Note - this won't do the right thing when we release 1.0!
+    assert __version_tuple__[0] == 0
+    assert isinstance(__version_tuple__[1], int)
     return version_str == f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"
+
+
+def _prev_minor_version():
+    """For the purpose of testing, return a previous minor version number."""
+    # In dev tree, this will return "0.-1", but that will work fine
+    assert isinstance(__version_tuple__[1], int)
+    return f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"
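
As a usage illustration (a sketch, not part of the patch itself), the snippet below shows how the effect of this change can be observed from outside the server. It assumes an OpenAI-compatible vLLM server is already running at localhost:8000; the /metrics endpoint, the two deprecated metric names, and the libraries used are the ones that appear in the test above.

# Illustrative sketch only, not part of the patch: scrape /metrics from a
# running vLLM OpenAI-compatible server (assumed to be at localhost:8000)
# and report whether the deprecated V0 gauges are still exported. They are
# only present when the server was started with
# --show-hidden-metrics-for-version=<previous minor version>.
import requests
from prometheus_client.parser import text_string_to_metric_families

HIDDEN_DEPRECATED_METRICS = [
    "vllm:num_requests_swapped",
    "vllm:cpu_cache_usage_perc",
]

response = requests.get("http://localhost:8000/metrics")
exported = {
    family.name
    for family in text_string_to_metric_families(response.text)
}
for metric in HIDDEN_DEPRECATED_METRICS:
    print(metric, "exported" if metric in exported else "hidden")

When the server is started with --show-hidden-metrics-for-version set to the previous minor release (for example the value returned by vllm.version._prev_minor_version(), as the test fixture above does), both gauges report "exported"; without the flag they report "hidden".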