# vllm/benchmarks/benchmark_latency.py

"""Benchmark the latency of processing a single batch of requests."""
import argparse
import time

import numpy as np
import torch
from tqdm import tqdm

from vllm import LLM, SamplingParams


def main(args: argparse.Namespace):
    """Time repeated generation over one fixed batch of dummy prompts.

    Builds an ``LLM`` and a ``SamplingParams`` from ``args``, runs one
    warm-up generation whose timing is discarded, then times
    ``args.num_iters`` further generations and prints their mean latency.

    Args:
        args: Parsed command-line options. Reads ``model``, ``tokenizer``,
            ``quantization``, ``tensor_parallel_size``, ``batch_size``,
            ``input_len``, ``trust_remote_code``, ``dtype``, ``n``,
            ``use_beam_search``, ``output_len`` and ``num_iters``.
    """
    print(args)

    batch_size = args.batch_size
    prompt_len = args.input_len

    # Size the engine limits from the batch so that all requests can be
    # handled in a single batch if possible.
    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(
        model=args.model,
        tokenizer=args.tokenizer,
        quantization=args.quantization,
        tensor_parallel_size=args.tensor_parallel_size,
        max_num_seqs=batch_size,
        max_num_batched_tokens=batch_size * prompt_len,
        trust_remote_code=args.trust_remote_code,
        dtype=args.dtype,
    )

    # Beam search runs use temperature 0.0; every other run uses 1.0.
    beam_search = args.use_beam_search
    if beam_search:
        temperature = 0.0
    else:
        temperature = 1.0
    sampling_params = SamplingParams(
        n=args.n,
        temperature=temperature,
        top_p=1.0,
        use_beam_search=beam_search,
        ignore_eos=True,
        max_tokens=args.output_len,
    )
    print(sampling_params)

    # Each prompt is ``input_len`` copies of token id 0; the batch holds
    # ``batch_size`` references to that same list.
    one_prompt = [0] * prompt_len
    dummy_prompt_token_ids = [one_prompt] * batch_size

    def run_to_completion(profile: bool = False):
        """Generate once for the dummy batch and return elapsed seconds.

        When ``profile`` is true, the generation is bracketed by
        ``cudaProfilerStart`` / ``cudaProfilerStop`` calls.
        """
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        began = time.perf_counter()

        llm.generate(
            prompt_token_ids=dummy_prompt_token_ids,
            sampling_params=sampling_params,
            use_tqdm=False,
        )

        elapsed = time.perf_counter() - began
        if profile:
            torch.cuda.cudart().cudaProfilerStop()
        return elapsed

    print("Warming up...")
    run_to_completion(profile=False)

    # Benchmark: one timed generation per iteration, then report the mean.
    progress = tqdm(range(args.num_iters), desc="Profiling iterations")
    latencies = [run_to_completion(profile=False) for _ in progress]
    print(f'Avg latency: {np.mean(latencies)} seconds')


if __name__ == '__main__':
    # Build the command-line interface, parse it, and hand the resulting
    # namespace to main().
    parser = argparse.ArgumentParser(
        description=('Benchmark the latency of processing a single batch '
                     'of requests till completion.'))

    # Model / engine options.
    parser.add_argument('--model', default='facebook/opt-125m', type=str)
    parser.add_argument('--tokenizer', default=None, type=str)
    parser.add_argument(
        '--quantization',
        '-q',
        default=None,
        choices=['awq', 'squeezellm', None],
    )
    parser.add_argument('--tensor-parallel-size', '-tp', default=1, type=int)

    # Workload shape.
    parser.add_argument('--input-len', default=32, type=int)
    parser.add_argument('--output-len', default=128, type=int)
    parser.add_argument('--batch-size', default=8, type=int)

    # Sampling options.
    parser.add_argument(
        '--n',
        default=1,
        type=int,
        help='Number of generated sequences per prompt.',
    )
    parser.add_argument('--use-beam-search', action='store_true')

    # Number of timed iterations.
    parser.add_argument(
        '--num-iters',
        default=3,
        type=int,
        help='Number of iterations to run.',
    )

    parser.add_argument(
        '--trust-remote-code',
        action='store_true',
        help='trust remote code from huggingface',
    )
    parser.add_argument(
        '--dtype',
        default='auto',
        type=str,
        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
        help=('data type for model weights and activations. The "auto" option '
              'will use FP16 precision for FP32 and FP16 models, and BF16 '
              'precision for BF16 models.'),
    )

    args = parser.parse_args()
    main(args)
Add script for benchmarking serving throughput (#145) 2023-06-14 19:55:38 -07:00			`"""Benchmark the latency of processing a single batch of requests."""`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`import argparse`
			`import time`

			`import numpy as np`
			`import torch`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`from tqdm import tqdm`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Change the name to vLLM (#150) 2023-06-17 03:07:40 -07:00			`from vllm import LLM, SamplingParams`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00

			`def main(args: argparse.Namespace):`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`print(args)`

			`# Process all the requests in a single batch if possible.`
			`# NOTE(woosuk): If the request cannot be processed in a single batch,`
Rename servers to engines (#152) 2023-06-17 17:25:21 +08:00			`# the engine will automatically process the request in multiple batches.`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`llm = LLM(`
			`model=args.model,`
[Tokenizer] Add an option to specify tokenizer (#284) 2023-06-28 09:46:58 -07:00			`tokenizer=args.tokenizer,`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`quantization=args.quantization,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`tensor_parallel_size=args.tensor_parallel_size,`
			`max_num_seqs=args.batch_size,`
			`max_num_batched_tokens=args.batch_size * args.input_len,`
fix: enable trust-remote-code in api server & benchmark. (#509) 2023-07-20 08:06:15 +08:00			`trust_remote_code=args.trust_remote_code,`
Added `dtype` arg to benchmarks (#1228) 2023-10-01 00:04:03 -04:00			`dtype=args.dtype,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`sampling_params = SamplingParams(`
			`n=args.n,`
			`temperature=0.0 if args.use_beam_search else 1.0,`
			`top_p=1.0,`
			`use_beam_search=args.use_beam_search,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`ignore_eos=True,`
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`max_tokens=args.output_len,`
			`)`
Implement block copy kernel to optimize beam search (#32) 2023-04-07 17:45:07 -07:00			`print(sampling_params)`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`def run_to_completion(profile: bool = False):`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`if profile:`
			`torch.cuda.cudart().cudaProfilerStart()`
Use monotonic time where appropriate (#1249) 2023-10-02 19:22:05 -07:00			`start_time = time.perf_counter()`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00
Add docstrings for LLM (#137) 2023-06-04 12:52:41 -07:00			`llm.generate(prompt_token_ids=dummy_prompt_token_ids,`
			`sampling_params=sampling_params,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`use_tqdm=False)`

Use monotonic time where appropriate (#1249) 2023-10-02 19:22:05 -07:00			`end_time = time.perf_counter()`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`latency = end_time - start_time`
			`if profile:`
			`torch.cuda.cudart().cudaProfilerStop()`
			`return latency`

Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`print("Warming up...")`
			`run_to_completion(profile=False)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
			`# Benchmark.`
			`latencies = []`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):`
			`latencies.append(run_to_completion(profile=False))`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`print(f'Avg latency: {np.mean(latencies)} seconds')`


			`if __name__ == '__main__':`
Add an option to launch cacheflow without ray (#51) 2023-04-30 15:42:17 +08:00			`parser = argparse.ArgumentParser(`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`description='Benchmark the latency of processing a single batch of '`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`'requests till completion.')`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`parser.add_argument('--model', type=str, default='facebook/opt-125m')`
[Tokenizer] Add an option to specify tokenizer (#284) 2023-06-28 09:46:58 -07:00			`parser.add_argument('--tokenizer', type=str, default=None)`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`parser.add_argument('--quantization',`
			`'-q',`
Support SqueezeLLM (#1326) Co-authored-by: squeeze-ai-lab <squeezeailab.bair@gmail.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> 2023-10-22 03:14:59 -03:00			`choices=['awq', 'squeezellm', None],`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`default=None)`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`parser.add_argument('--input-len', type=int, default=32)`
			`parser.add_argument('--output-len', type=int, default=128)`
			`parser.add_argument('--batch-size', type=int, default=8)`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`parser.add_argument('--n',`
			`type=int,`
			`default=1,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`help='Number of generated sequences per prompt.')`
Implement block copy kernel to optimize beam search (#32) 2023-04-07 17:45:07 -07:00			`parser.add_argument('--use-beam-search', action='store_true')`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`parser.add_argument('--num-iters',`
			`type=int,`
			`default=3,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`help='Number of iterations to run.')`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`parser.add_argument('--trust-remote-code',`
			`action='store_true',`
fix: enable trust-remote-code in api server & benchmark. (#509) 2023-07-20 08:06:15 +08:00			`help='trust remote code from huggingface')`
Added `dtype` arg to benchmarks (#1228) 2023-10-01 00:04:03 -04:00			`parser.add_argument(`
			`'--dtype',`
			`type=str,`
			`default='auto',`
			`choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],`
			`help='data type for model weights and activations. '`
			`'The "auto" option will use FP16 precision '`
			`'for FP32 and FP16 models, and BF16 precision '`
			`'for BF16 models.')`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`args = parser.parse_args()`
			`main(args)`