import argparse
import time
from typing import List

from tqdm import tqdm
import numpy as np
import torch

from cacheflow.core.server import (
    add_server_arguments, process_server_arguments,
    init_local_server_and_frontend_with_arguments)
from cacheflow.sampling_params import SamplingParams


def main(args: argparse.Namespace):
    server, frontend = init_local_server_and_frontend_with_arguments(args)

    sampling_params = SamplingParams(
        n=args.n,
        temperature=0.0 if args.use_beam_search else 1.0,
        top_p=1.0,
        use_beam_search=args.use_beam_search,
        stop_token_ids=set(),
        max_tokens=args.output_len,
    )
    print(sampling_params)
    # Dummy prompt of all-zero token IDs with the requested input length.
    input_token_ids = [0] * args.input_len

    # Enqueue a batch of dummy requests, run the server until every request
    # finishes, and return the measured end-to-end latency in seconds.
    def profile_step(profile=False):
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        for _ in range(args.batch_size):
            dummy_prompt = ""
            frontend._add_query(dummy_prompt, input_token_ids, sampling_params)
        server.add_sequence_groups(frontend.get_inputs())
        start_time = time.time()
        while True:
            server.step()
            if not server.has_unfinished_requests():
                break
        end_time = time.time()
        latency = end_time - start_time
        if profile:
            torch.cuda.cudart().cudaProfilerStop()
        return latency

    print("Warm up step")
    profile_step()

    # Benchmark.
    latencies = []
    for _ in tqdm(range(3), desc="Profile step"):
        latencies.append(profile_step())
    print(f'Avg latency: {np.mean(latencies):.3f} seconds')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Benchmark the latency of decoding a single batch of requests.')
    parser = add_server_arguments(parser)
    parser.add_argument('--input-len', type=int, default=32)
    parser.add_argument('--output-len', type=int, default=128)
    parser.add_argument('--batch-size', type=int, default=8)
    parser.add_argument('--n', type=int, default=1)
    parser.add_argument('--use-beam-search', action='store_true')
    args = parser.parse_args()
    args = process_server_arguments(args)
    # Make sure the whole batch of prompts fits into a single scheduling step.
    args.max_num_batched_tokens = max(
        args.max_num_batched_tokens, args.batch_size * args.input_len)
    print(args)
    main(args)