# vllm/benchmarks/benchmark_latency.py

"""Benchmark the latency of processing a single batch of requests."""
import argparse
import time

import numpy as np
import torch
from tqdm import tqdm

from cacheflow import LLM, SamplingParams


def main(args: argparse.Namespace) -> None:
    """Measure the latency of generating a single batch of dummy requests.

    Builds an ``LLM`` from ``args``, runs one warm-up generation whose latency
    is discarded, then times ``args.num_iters`` further generations of the
    same batch and prints the mean latency in seconds.

    Args:
        args: Parsed command-line options. Reads ``model``,
            ``tensor_parallel_size``, ``batch_size``, ``input_len``,
            ``output_len``, ``n``, ``use_beam_search`` and ``num_iters``.
    """
    print(args)

    # Process all the requests in a single batch if possible.
    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(
        model=args.model,
        tensor_parallel_size=args.tensor_parallel_size,
        max_num_seqs=args.batch_size,
        max_num_batched_tokens=args.batch_size * args.input_len,
    )

    sampling_params = SamplingParams(
        n=args.n,
        # Beam search is run at temperature 0; otherwise sample at 1.0.
        temperature=0.0 if args.use_beam_search else 1.0,
        top_p=1.0,
        use_beam_search=args.use_beam_search,
        # NOTE(review): presumably makes every sequence run for the full
        # max_tokens so each iteration does the same work -- confirm against
        # SamplingParams.
        ignore_eos=True,
        max_tokens=args.output_len,
    )
    print(sampling_params)
    # Every request is `input_len` copies of token id 0. Build a separate list
    # per request rather than repeating one list object, so the requests never
    # alias each other's token storage.
    dummy_prompt_token_ids = [[0] * args.input_len
                              for _ in range(args.batch_size)]

    def run_to_completion(profile: bool = False) -> float:
        """Generate the whole dummy batch once and return the elapsed seconds.

        Args:
            profile: When true, bracket the run with CUDA profiler start/stop
                calls.
        """
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval is not distorted by system clock adjustments.
        start_time = time.perf_counter()

        llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                     sampling_params=sampling_params,
                     use_tqdm=False)

        latency = time.perf_counter() - start_time
        if profile:
            torch.cuda.cudart().cudaProfilerStop()
        return latency

    print("Warming up...")
    run_to_completion(profile=False)

    # Benchmark.
    latencies = []
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
        latencies.append(run_to_completion(profile=False))
    print(f'Avg latency: {np.mean(latencies)} seconds')


if __name__ == '__main__':
    # Command-line options as (flags, add_argument keyword options), listed in
    # the order they are registered with the parser.
    cli_options = (
        (('--model',), {'type': str, 'default': 'facebook/opt-125m'}),
        (('--tensor-parallel-size', '-tp'), {'type': int, 'default': 1}),
        (('--input-len',), {'type': int, 'default': 32}),
        (('--output-len',), {'type': int, 'default': 128}),
        (('--batch-size',), {'type': int, 'default': 8}),
        (('--n',), {'type': int, 'default': 1,
                    'help': 'Number of generated sequences per prompt.'}),
        (('--use-beam-search',), {'action': 'store_true'}),
        (('--num-iters',), {'type': int, 'default': 3,
                            'help': 'Number of iterations to run.'}),
    )
    description = ('Benchmark the latency of processing a single batch of '
                   'requests till completion.')

    parser = argparse.ArgumentParser(description=description)
    for flags, options in cli_options:
        parser.add_argument(*flags, **options)

    # Parse sys.argv and hand the resulting namespace to the benchmark.
    args = parser.parse_args()
    main(args)
Add script for benchmarking serving throughput (#145) 2023-06-14 19:55:38 -07:00			`"""Benchmark the latency of processing a single batch of requests."""`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`import argparse`
			`import time`

			`import numpy as np`
			`import torch`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`from tqdm import tqdm`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`from cacheflow import LLM, SamplingParams`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00

			`def main(args: argparse.Namespace):`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`print(args)`

			`# Process all the requests in a single batch if possible.`
			`# NOTE(woosuk): If the request cannot be processed in a single batch,`
Rename servers to engines (#152) 2023-06-17 17:25:21 +08:00			`# the engine will automatically process the request in multiple batches.`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`llm = LLM(`
			`model=args.model,`
			`tensor_parallel_size=args.tensor_parallel_size,`
			`max_num_seqs=args.batch_size,`
			`max_num_batched_tokens=args.batch_size * args.input_len,`
			`)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`sampling_params = SamplingParams(`
			`n=args.n,`
			`temperature=0.0 if args.use_beam_search else 1.0,`
			`top_p=1.0,`
			`use_beam_search=args.use_beam_search,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`ignore_eos=True,`
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`max_tokens=args.output_len,`
			`)`
Implement block copy kernel to optimize beam search (#32) 2023-04-07 17:45:07 -07:00			`print(sampling_params)`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`def run_to_completion(profile: bool = False):`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`if profile:`
			`torch.cuda.cudart().cudaProfilerStart()`
			`start_time = time.time()`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00
Add docstrings for LLM (#137) 2023-06-04 12:52:41 -07:00			`llm.generate(prompt_token_ids=dummy_prompt_token_ids,`
			`sampling_params=sampling_params,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`use_tqdm=False)`

Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`end_time = time.time()`
			`latency = end_time - start_time`
			`if profile:`
			`torch.cuda.cudart().cudaProfilerStop()`
			`return latency`

Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`print("Warming up...")`
			`run_to_completion(profile=False)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
			`# Benchmark.`
			`latencies = []`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):`
			`latencies.append(run_to_completion(profile=False))`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`print(f'Avg latency: {np.mean(latencies)} seconds')`


			`if __name__ == '__main__':`
Add an option to launch cacheflow without ray (#51) 2023-04-30 15:42:17 +08:00			`parser = argparse.ArgumentParser(`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`description='Benchmark the latency of processing a single batch of '`
			`'requests till completion.')`
			`parser.add_argument('--model', type=str, default='facebook/opt-125m')`
			`parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`parser.add_argument('--input-len', type=int, default=32)`
			`parser.add_argument('--output-len', type=int, default=128)`
			`parser.add_argument('--batch-size', type=int, default=8)`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`parser.add_argument('--n', type=int, default=1,`
			`help='Number of generated sequences per prompt.')`
Implement block copy kernel to optimize beam search (#32) 2023-04-07 17:45:07 -07:00			`parser.add_argument('--use-beam-search', action='store_true')`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`parser.add_argument('--num-iters', type=int, default=3,`
			`help='Number of iterations to run.')`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`args = parser.parse_args()`
			`main(args)`