[Misc]: Add support for goodput on guided benchmarking + TPOT calculation refactor (#13736)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
This commit is contained in:
parent
215bf150a6
commit
ec8a5e5386
@ -9,7 +9,7 @@ On the server side, run one of the following commands:
|
||||
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
|
||||
|
||||
On the client side, run:
|
||||
python benchmarks/benchmark_serving.py \
|
||||
python benchmarks/benchmark_serving_guided.py \
|
||||
--backend <backend> \
|
||||
--model <your_model> \
|
||||
--dataset json \
|
||||
@ -31,7 +31,7 @@ import random
|
||||
import time
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import AsyncGenerator, List, Optional, Tuple
|
||||
from typing import AsyncGenerator, Dict, List, Optional, Tuple
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
@ -264,6 +264,7 @@ def calculate_metrics(
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
selected_percentile_metrics: List[str],
|
||||
selected_percentiles: List[float],
|
||||
goodput_config_dict: Optional[Dict[str, float]] = None,
|
||||
) -> Tuple[BenchmarkMetrics, List[int]]:
|
||||
actual_output_lens: List[int] = []
|
||||
total_input = 0
|
||||
@ -287,10 +288,10 @@ def calculate_metrics(
|
||||
total_input += input_requests[i].prompt_len
|
||||
tpot = 0
|
||||
if output_len > 1:
|
||||
tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
|
||||
1)
|
||||
latency_minus_ttft = outputs[i].latency - outputs[i].ttft
|
||||
tpot = latency_minus_ttft / (output_len - 1)
|
||||
tpots.append(tpot)
|
||||
outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
|
||||
outputs[i].tpot = tpot
|
||||
# Note: if output_len <= 1, we regard tpot as 0 for goodput
|
||||
all_tpots.append(tpot)
|
||||
itls += outputs[i].itl
|
||||
@ -300,6 +301,28 @@ def calculate_metrics(
|
||||
else:
|
||||
actual_output_lens.append(0)
|
||||
|
||||
if goodput_config_dict:
|
||||
valid_metrics = []
|
||||
slo_values = []
|
||||
|
||||
if "ttft" in goodput_config_dict:
|
||||
valid_metrics.append(ttfts)
|
||||
slo_values.append(goodput_config_dict["ttft"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
if "tpot" in goodput_config_dict:
|
||||
valid_metrics.append(all_tpots)
|
||||
slo_values.append(goodput_config_dict["tpot"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
if "e2el" in goodput_config_dict:
|
||||
valid_metrics.append(e2els)
|
||||
slo_values.append(goodput_config_dict["e2el"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
|
||||
for req_metric in zip(*valid_metrics):
|
||||
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
||||
if is_good_req:
|
||||
good_completed += 1
|
||||
|
||||
if completed == 0:
|
||||
warnings.warn(
|
||||
"All requests failed. This is likely due to a misconfiguration "
|
||||
@ -356,6 +379,7 @@ async def benchmark(
|
||||
max_concurrency: Optional[int],
|
||||
guided_decoding_ratio: float,
|
||||
guided_decoding_backend: str,
|
||||
goodput_config_dict: Optional[Dict[str, float]] = None,
|
||||
):
|
||||
if backend in ASYNC_REQUEST_FUNCS:
|
||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||
@ -483,6 +507,7 @@ async def benchmark(
|
||||
tokenizer=tokenizer,
|
||||
selected_percentile_metrics=selected_percentile_metrics,
|
||||
selected_percentiles=selected_percentiles,
|
||||
goodput_config_dict=goodput_config_dict,
|
||||
)
|
||||
|
||||
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
||||
@ -494,6 +519,9 @@ async def benchmark(
|
||||
metrics.total_output))
|
||||
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
||||
metrics.request_throughput))
|
||||
if goodput_config_dict:
|
||||
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
|
||||
metrics.request_goodput))
|
||||
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
||||
metrics.output_throughput))
|
||||
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
|
||||
@ -617,6 +645,40 @@ def evaluate(ret, args):
|
||||
100) if len(not_none_scores) > 0 else None
|
||||
|
||||
|
||||
def parse_goodput(slo_pairs):
|
||||
goodput_config_dict = {}
|
||||
try:
|
||||
for slo_pair in slo_pairs:
|
||||
slo_name, slo_val = slo_pair.split(":")
|
||||
goodput_config_dict[slo_name] = float(slo_val)
|
||||
except ValueError as err:
|
||||
raise argparse.ArgumentTypeError(
|
||||
"Invalid format found for service level objectives. "
|
||||
"Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
"pairs, where the key is a metric name, and the value is a "
|
||||
"number in milliseconds.") from err
|
||||
return goodput_config_dict
|
||||
|
||||
|
||||
def check_goodput_args(args):
|
||||
goodput_config_dict = {}
|
||||
VALID_NAMES = ["ttft", "tpot", "e2el"]
|
||||
if args.goodput:
|
||||
goodput_config_dict = parse_goodput(args.goodput)
|
||||
for slo_name, slo_val in goodput_config_dict.items():
|
||||
if slo_name not in VALID_NAMES:
|
||||
raise ValueError(
|
||||
f"Invalid metric name found, {slo_name}: {slo_val}. "
|
||||
"The service level objective name should be one of "
|
||||
f"{str(VALID_NAMES)}. ")
|
||||
if slo_val < 0:
|
||||
raise ValueError(
|
||||
f"Invalid value found, {slo_name}: {slo_val}. "
|
||||
"The service level objective value should be "
|
||||
"non-negative.")
|
||||
return goodput_config_dict
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
random.seed(args.seed)
|
||||
@ -661,6 +723,8 @@ def main(args: argparse.Namespace):
|
||||
|
||||
input_requests = sample_requests(tokenizer, args)
|
||||
|
||||
goodput_config_dict = check_goodput_args(args)
|
||||
|
||||
benchmark_result, ret = asyncio.run(
|
||||
benchmark(
|
||||
backend=backend,
|
||||
@ -681,6 +745,7 @@ def main(args: argparse.Namespace):
|
||||
max_concurrency=args.max_concurrency,
|
||||
guided_decoding_ratio=args.guided_decoding_ratio,
|
||||
guided_decoding_backend=args.guided_decoding_backend,
|
||||
goodput_config_dict=goodput_config_dict,
|
||||
))
|
||||
|
||||
# Save config and results to json
|
||||
@ -865,6 +930,18 @@ if __name__ == "__main__":
|
||||
"Default value is \"99\". "
|
||||
"Use \"--percentile-metrics\" to select metrics.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--goodput",
|
||||
nargs="+",
|
||||
required=False,
|
||||
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
"pairs, where the key is a metric name, and the value is in "
|
||||
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
|
||||
"separated by spaces. Allowed request level metric names are "
|
||||
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
|
||||
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
|
||||
"and the blog: https://hao-ai-lab.github.io/blogs/distserve")
|
||||
|
||||
parser.add_argument("--no-guided-decoding",
|
||||
action='store_true',
|
||||
default=False,
|
||||
|
Loading…
x
Reference in New Issue
Block a user