[Misc]: Add support for goodput on guided benchmarking + TPOT calculation refactor (#13736)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
This commit is contained in:
parent
215bf150a6
commit
ec8a5e5386
@ -9,7 +9,7 @@ On the server side, run one of the following commands:
|
|||||||
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
|
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
|
||||||
|
|
||||||
On the client side, run:
|
On the client side, run:
|
||||||
python benchmarks/benchmark_serving.py \
|
python benchmarks/benchmark_serving_guided.py \
|
||||||
--backend <backend> \
|
--backend <backend> \
|
||||||
--model <your_model> \
|
--model <your_model> \
|
||||||
--dataset json \
|
--dataset json \
|
||||||
@ -31,7 +31,7 @@ import random
|
|||||||
import time
|
import time
|
||||||
import warnings
|
import warnings
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import AsyncGenerator, List, Optional, Tuple
|
from typing import AsyncGenerator, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -264,6 +264,7 @@ def calculate_metrics(
|
|||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
selected_percentile_metrics: List[str],
|
selected_percentile_metrics: List[str],
|
||||||
selected_percentiles: List[float],
|
selected_percentiles: List[float],
|
||||||
|
goodput_config_dict: Optional[Dict[str, float]] = None,
|
||||||
) -> Tuple[BenchmarkMetrics, List[int]]:
|
) -> Tuple[BenchmarkMetrics, List[int]]:
|
||||||
actual_output_lens: List[int] = []
|
actual_output_lens: List[int] = []
|
||||||
total_input = 0
|
total_input = 0
|
||||||
@ -287,10 +288,10 @@ def calculate_metrics(
|
|||||||
total_input += input_requests[i].prompt_len
|
total_input += input_requests[i].prompt_len
|
||||||
tpot = 0
|
tpot = 0
|
||||||
if output_len > 1:
|
if output_len > 1:
|
||||||
tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
|
latency_minus_ttft = outputs[i].latency - outputs[i].ttft
|
||||||
1)
|
tpot = latency_minus_ttft / (output_len - 1)
|
||||||
tpots.append(tpot)
|
tpots.append(tpot)
|
||||||
outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
|
outputs[i].tpot = tpot
|
||||||
# Note: if output_len <= 1, we regard tpot as 0 for goodput
|
# Note: if output_len <= 1, we regard tpot as 0 for goodput
|
||||||
all_tpots.append(tpot)
|
all_tpots.append(tpot)
|
||||||
itls += outputs[i].itl
|
itls += outputs[i].itl
|
||||||
@ -300,6 +301,28 @@ def calculate_metrics(
|
|||||||
else:
|
else:
|
||||||
actual_output_lens.append(0)
|
actual_output_lens.append(0)
|
||||||
|
|
||||||
|
if goodput_config_dict:
|
||||||
|
valid_metrics = []
|
||||||
|
slo_values = []
|
||||||
|
|
||||||
|
if "ttft" in goodput_config_dict:
|
||||||
|
valid_metrics.append(ttfts)
|
||||||
|
slo_values.append(goodput_config_dict["ttft"] /
|
||||||
|
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||||
|
if "tpot" in goodput_config_dict:
|
||||||
|
valid_metrics.append(all_tpots)
|
||||||
|
slo_values.append(goodput_config_dict["tpot"] /
|
||||||
|
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||||
|
if "e2el" in goodput_config_dict:
|
||||||
|
valid_metrics.append(e2els)
|
||||||
|
slo_values.append(goodput_config_dict["e2el"] /
|
||||||
|
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||||
|
|
||||||
|
for req_metric in zip(*valid_metrics):
|
||||||
|
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
||||||
|
if is_good_req:
|
||||||
|
good_completed += 1
|
||||||
|
|
||||||
if completed == 0:
|
if completed == 0:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"All requests failed. This is likely due to a misconfiguration "
|
"All requests failed. This is likely due to a misconfiguration "
|
||||||
@ -356,6 +379,7 @@ async def benchmark(
|
|||||||
max_concurrency: Optional[int],
|
max_concurrency: Optional[int],
|
||||||
guided_decoding_ratio: float,
|
guided_decoding_ratio: float,
|
||||||
guided_decoding_backend: str,
|
guided_decoding_backend: str,
|
||||||
|
goodput_config_dict: Optional[Dict[str, float]] = None,
|
||||||
):
|
):
|
||||||
if backend in ASYNC_REQUEST_FUNCS:
|
if backend in ASYNC_REQUEST_FUNCS:
|
||||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||||
@ -483,6 +507,7 @@ async def benchmark(
|
|||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
selected_percentile_metrics=selected_percentile_metrics,
|
selected_percentile_metrics=selected_percentile_metrics,
|
||||||
selected_percentiles=selected_percentiles,
|
selected_percentiles=selected_percentiles,
|
||||||
|
goodput_config_dict=goodput_config_dict,
|
||||||
)
|
)
|
||||||
|
|
||||||
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
||||||
@ -494,6 +519,9 @@ async def benchmark(
|
|||||||
metrics.total_output))
|
metrics.total_output))
|
||||||
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
||||||
metrics.request_throughput))
|
metrics.request_throughput))
|
||||||
|
if goodput_config_dict:
|
||||||
|
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
|
||||||
|
metrics.request_goodput))
|
||||||
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
||||||
metrics.output_throughput))
|
metrics.output_throughput))
|
||||||
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
|
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
|
||||||
@ -617,6 +645,40 @@ def evaluate(ret, args):
|
|||||||
100) if len(not_none_scores) > 0 else None
|
100) if len(not_none_scores) > 0 else None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_goodput(slo_pairs):
    """Parse ``KEY:VALUE`` service level objective strings into a dict.

    Metric names are not validated here; only the ``KEY:VALUE`` shape and
    the numeric value are checked.

    Args:
        slo_pairs: Iterable of strings, each formatted as ``"KEY:VALUE"``
            where VALUE is a number (milliseconds, per the error message
            below).

    Returns:
        Dict mapping each key to its value converted to ``float``. If a key
        is repeated, the last occurrence wins.

    Raises:
        argparse.ArgumentTypeError: If a pair does not contain exactly one
            ``:``, or its value is not a number (including ``nan``).
    """
    # Function-scope import keeps this block self-contained.
    import math

    goodput_config_dict = {}
    try:
        for slo_pair in slo_pairs:
            # Unpacking raises ValueError unless there is exactly one ":".
            slo_name, slo_val = slo_pair.split(":")
            slo_num = float(slo_val)
            # float() happily parses "nan"; a NaN threshold makes every
            # later `slo >= metric` comparison False, silently yielding a
            # goodput of zero. Treat it as an invalid number instead.
            if math.isnan(slo_num):
                raise ValueError(
                    f"service level objective {slo_name!r} is NaN")
            goodput_config_dict[slo_name] = slo_num
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "Invalid format found for service level objectives. "
            "Specify service level objectives for goodput as \"KEY:VALUE\" "
            "pairs, where the key is a metric name, and the value is a "
            "number in milliseconds.") from err
    return goodput_config_dict
|
||||||
|
|
||||||
|
|
||||||
|
def check_goodput_args(args):
    """Build and validate the goodput SLO configuration from CLI args.

    Args:
        args: Namespace-like object with a ``goodput`` attribute. When it is
            truthy it is passed to ``parse_goodput``.

    Returns:
        Dict mapping service level objective names to their values, or an
        empty dict when ``args.goodput`` is falsy.

    Raises:
        ValueError: If a name is not one of ``ttft``, ``tpot``, ``e2el``, or
            if a value is negative or NaN.
    """
    goodput_config_dict = {}
    VALID_NAMES = ["ttft", "tpot", "e2el"]
    if args.goodput:
        goodput_config_dict = parse_goodput(args.goodput)
        for slo_name, slo_val in goodput_config_dict.items():
            if slo_name not in VALID_NAMES:
                raise ValueError(
                    f"Invalid metric name found, {slo_name}: {slo_val}. "
                    "The service level objective name should be one of "
                    f"{str(VALID_NAMES)}. ")
            # Written as `not (x >= 0)` rather than `x < 0` so that NaN,
            # for which every ordering comparison is False, is rejected
            # here instead of silently producing a goodput of zero.
            if not slo_val >= 0:
                raise ValueError(
                    f"Invalid value found, {slo_name}: {slo_val}. "
                    "The service level objective value should be "
                    "non-negative.")
    return goodput_config_dict
|
||||||
|
|
||||||
|
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
print(args)
|
print(args)
|
||||||
random.seed(args.seed)
|
random.seed(args.seed)
|
||||||
@ -661,6 +723,8 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
input_requests = sample_requests(tokenizer, args)
|
input_requests = sample_requests(tokenizer, args)
|
||||||
|
|
||||||
|
goodput_config_dict = check_goodput_args(args)
|
||||||
|
|
||||||
benchmark_result, ret = asyncio.run(
|
benchmark_result, ret = asyncio.run(
|
||||||
benchmark(
|
benchmark(
|
||||||
backend=backend,
|
backend=backend,
|
||||||
@ -681,6 +745,7 @@ def main(args: argparse.Namespace):
|
|||||||
max_concurrency=args.max_concurrency,
|
max_concurrency=args.max_concurrency,
|
||||||
guided_decoding_ratio=args.guided_decoding_ratio,
|
guided_decoding_ratio=args.guided_decoding_ratio,
|
||||||
guided_decoding_backend=args.guided_decoding_backend,
|
guided_decoding_backend=args.guided_decoding_backend,
|
||||||
|
goodput_config_dict=goodput_config_dict,
|
||||||
))
|
))
|
||||||
|
|
||||||
# Save config and results to json
|
# Save config and results to json
|
||||||
@ -865,6 +930,18 @@ if __name__ == "__main__":
|
|||||||
"Default value is \"99\". "
|
"Default value is \"99\". "
|
||||||
"Use \"--percentile-metrics\" to select metrics.",
|
"Use \"--percentile-metrics\" to select metrics.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--goodput",
|
||||||
|
nargs="+",
|
||||||
|
required=False,
|
||||||
|
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||||
|
"pairs, where the key is a metric name, and the value is in "
|
||||||
|
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
|
||||||
|
"separated by spaces. Allowed request level metric names are "
|
||||||
|
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
|
||||||
|
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
|
||||||
|
"and the blog: https://hao-ai-lab.github.io/blogs/distserve")
|
||||||
|
|
||||||
parser.add_argument("--no-guided-decoding",
|
parser.add_argument("--no-guided-decoding",
|
||||||
action='store_true',
|
action='store_true',
|
||||||
default=False,
|
default=False,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user