[V1][Metrics] Add TTFT and TPOT histograms (#12530)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
parent
dd6a3a02cb
commit
46fb056749
@ -208,6 +208,12 @@ EXPECTED_METRICS_V1 = [
|
||||
"vllm:request_generation_tokens_sum",
|
||||
"vllm:request_generation_tokens_bucket",
|
||||
"vllm:request_generation_tokens_count",
|
||||
"vllm:time_to_first_token_seconds_sum",
|
||||
"vllm:time_to_first_token_seconds_bucket",
|
||||
"vllm:time_to_first_token_seconds_count",
|
||||
"vllm:time_per_output_token_seconds_sum",
|
||||
"vllm:time_per_output_token_seconds_bucket",
|
||||
"vllm:time_per_output_token_seconds_count",
|
||||
]
|
||||
|
||||
|
||||
|
@ -27,6 +27,7 @@ class RequestState:
|
||||
prompt: Optional[str],
|
||||
prompt_token_ids: List[int],
|
||||
detokenizer: IncrementalDetokenizer,
|
||||
arrival_time: float,
|
||||
queue: Optional[asyncio.Queue[RequestOutput]],
|
||||
):
|
||||
self.request_id = request_id
|
||||
@ -37,7 +38,7 @@ class RequestState:
|
||||
self.is_prefilling = True
|
||||
self.queue = queue
|
||||
|
||||
self.stats = RequestStateStats()
|
||||
self.stats = RequestStateStats(last_token_time=arrival_time)
|
||||
|
||||
@classmethod
|
||||
def from_new_request(
|
||||
@ -54,6 +55,7 @@ class RequestState:
|
||||
tokenizer=tokenizer,
|
||||
request=request,
|
||||
),
|
||||
arrival_time=request.arrival_time,
|
||||
queue=queue,
|
||||
)
|
||||
|
||||
|
@ -121,6 +121,26 @@ class PrometheusStatLogger(StatLoggerBase):
|
||||
buckets=build_1_2_5_buckets(max_model_len),
|
||||
labelnames=labelnames).labels(*labelvalues)
|
||||
|
||||
self.histogram_time_to_first_token = \
|
||||
prometheus_client.Histogram(
|
||||
name="vllm:time_to_first_token_seconds",
|
||||
documentation="Histogram of time to first token in seconds.",
|
||||
buckets=[
|
||||
0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
|
||||
0.75, 1.0, 2.5, 5.0, 7.5, 10.0
|
||||
],
|
||||
labelnames=labelnames).labels(*labelvalues)
|
||||
|
||||
self.histogram_time_per_output_token = \
|
||||
prometheus_client.Histogram(
|
||||
name="vllm:time_per_output_token_seconds",
|
||||
documentation="Histogram of time per output token in seconds.",
|
||||
buckets=[
|
||||
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5,
|
||||
0.75, 1.0, 2.5
|
||||
],
|
||||
labelnames=labelnames).labels(*labelvalues)
|
||||
|
||||
def log(self, scheduler_stats: SchedulerStats,
|
||||
iteration_stats: IterationStats):
|
||||
"""Log to prometheus."""
|
||||
@ -137,6 +157,11 @@ class PrometheusStatLogger(StatLoggerBase):
|
||||
self.histogram_num_generation_tokens_request.observe(
|
||||
finished_request.num_generation_tokens)
|
||||
|
||||
for ttft in iteration_stats.time_to_first_tokens_iter:
|
||||
self.histogram_time_to_first_token.observe(ttft)
|
||||
for tpot in iteration_stats.time_per_output_tokens_iter:
|
||||
self.histogram_time_per_output_token.observe(tpot)
|
||||
|
||||
@staticmethod
|
||||
def _unregister_vllm_metrics():
|
||||
# Unregister any existing vLLM collectors (for CI/CD
|
||||
|
@ -1,3 +1,4 @@
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, List
|
||||
|
||||
@ -22,6 +23,7 @@ class RequestStateStats:
|
||||
"""Stats that need to be tracked across delta updates."""
|
||||
|
||||
num_generation_tokens: int = 0
|
||||
last_token_time: float = 0.0
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -40,6 +42,8 @@ class IterationStats:
|
||||
self.num_generation_tokens = 0
|
||||
self.num_prompt_tokens = 0
|
||||
self.finished_requests: List[FinishedRequestStats] = []
|
||||
self.time_to_first_tokens_iter: List[float] = []
|
||||
self.time_per_output_tokens_iter: List[float] = []
|
||||
|
||||
def update_from_output(self, output: "EngineCoreOutput",
|
||||
is_prefilling: bool, prompt_len: int,
|
||||
@ -48,6 +52,8 @@ class IterationStats:
|
||||
return
|
||||
|
||||
num_new_generation_tokens = len(output.new_token_ids)
|
||||
now = time.time()
|
||||
last_token_latency = now - request_state_stats.last_token_time
|
||||
|
||||
self.num_generation_tokens += num_new_generation_tokens
|
||||
if is_prefilling:
|
||||
@ -58,7 +64,12 @@ class IterationStats:
|
||||
assert (num_new_generation_tokens > 0)
|
||||
self.num_prompt_tokens += prompt_len
|
||||
|
||||
self.time_to_first_tokens_iter.append(last_token_latency)
|
||||
else:
|
||||
self.time_per_output_tokens_iter.append(last_token_latency)
|
||||
|
||||
request_state_stats.num_generation_tokens += num_new_generation_tokens
|
||||
request_state_stats.last_token_time = now
|
||||
|
||||
def update_from_finished_request(self, request_output: "RequestOutput",
|
||||
request_state_stats: RequestStateStats):
|
||||
|
Loading…
x
Reference in New Issue
Block a user