From 4078052f09f42f898b542e18d60d15a43db67a8b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 30 Jan 2025 18:07:19 -0500 Subject: [PATCH] [V1][Log] Add max request concurrency log to V1 (#12569) Signed-off-by: mgoin --- vllm/v1/core/kv_cache_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index bab99fe3..dbdda51a 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -393,6 +393,10 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, num_blocks = num_gpu_blocks_override logger.info("# GPU blocks: %d", num_blocks) + max_concurrency = (num_blocks * vllm_config.cache_config.block_size / + vllm_config.model_config.max_model_len) + logger.info("Maximum concurrency for %s tokens per request: %.2fx", + vllm_config.model_config.max_model_len, max_concurrency) per_layer_size = page_size * num_blocks