vllm/benchmarks/benchmark_latency.py

"""Benchmark the latency of processing a single batch of requests."""
import argparse
import time
from pathlib import Path
from typing import Optional

import numpy as np
import torch
from tqdm import tqdm

from vllm import LLM, SamplingParams


def main(args: argparse.Namespace):
    """Measure (or profile) how long the engine takes to finish one batch.

    The function prints the parsed arguments, builds an ``LLM`` engine and a
    ``SamplingParams`` object from them, and generates a batch of random
    prompt token ids of shape ``(args.batch_size, args.input_len)``. After a
    single warm-up generation it does one of two things:

    * with ``args.profile`` set, runs one generation under
      ``torch.profiler`` and returns without timing anything;
    * otherwise, times ``args.num_iters`` generations and prints the mean.

    Args:
        args: Parsed command-line options of this script.
    """
    print(args)

    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    engine = LLM(
        model=args.model,
        tokenizer=args.tokenizer,
        quantization=args.quantization,
        tensor_parallel_size=args.tensor_parallel_size,
        trust_remote_code=args.trust_remote_code,
        dtype=args.dtype,
        enforce_eager=args.enforce_eager,
        kv_cache_dtype=args.kv_cache_dtype,
        device=args.device,
    )

    # Temperature is 0.0 when beam search is requested and 1.0 otherwise.
    if args.use_beam_search:
        decoding_temperature = 0.0
    else:
        decoding_temperature = 1.0
    sampling_params = SamplingParams(
        n=args.n,
        temperature=decoding_temperature,
        top_p=1.0,
        use_beam_search=args.use_beam_search,
        # Keep generating past EOS; the stop condition is max_tokens.
        ignore_eos=True,
        max_tokens=args.output_len,
    )
    print(sampling_params)

    # Random token ids in [0, 10000) serve as the prompts. The exact same
    # keyword arguments are reused for every generate() call below.
    prompt_shape = (args.batch_size, args.input_len)
    generate_kwargs = {
        "prompt_token_ids":
        np.random.randint(10000, size=prompt_shape).tolist(),
        "sampling_params": sampling_params,
        "use_tqdm": False,
    }

    def run_to_completion(profile_dir: Optional[str] = None):
        """Run one full generation.

        Returns the elapsed wall-clock seconds when ``profile_dir`` is falsy.
        Otherwise the run is traced with ``torch.profiler``, the trace handler
        is pointed at ``profile_dir``, the key averages are printed, and
        nothing is returned.
        """
        if not profile_dir:
            started = time.perf_counter()
            engine.generate(**generate_kwargs)
            return time.perf_counter() - started

        trace_handler = torch.profiler.tensorboard_trace_handler(
            str(profile_dir))
        traced_activities = [
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ]
        with torch.profiler.profile(activities=traced_activities,
                                    on_trace_ready=trace_handler) as prof:
            engine.generate(**generate_kwargs)
        print(prof.key_averages())

    print("Warming up...")
    run_to_completion(profile_dir=None)

    if args.profile:
        # Fall back to a timestamped directory under the current directory
        # when no explicit result directory was given.
        trace_dir = args.profile_result_dir or (
            Path(".") / "vllm_benchmark_result" /
            f"latency_result_{time.time()}")
        print(f"Profiling (results will be saved to '{trace_dir}')...")
        run_to_completion(profile_dir=trace_dir)
        return

    # Benchmark.
    latencies = [
        run_to_completion(profile_dir=None)
        for _ in tqdm(range(args.num_iters), desc="Profiling iterations")
    ]
    print(f'Avg latency: {np.mean(latencies)} seconds')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Benchmark the latency of processing a single batch of '
        'requests till completion.')

    # One (flags, keyword options) entry per command-line option. Entries are
    # registered in list order, which is also the order shown by --help.
    option_table = [
        # Engine construction options.
        (('--model', ), {
            'type': str,
            'default': 'facebook/opt-125m',
        }),
        (('--tokenizer', ), {
            'type': str,
            'default': None,
        }),
        (('--quantization', '-q'), {
            'choices': ['awq', 'gptq', 'squeezellm', None],
            'default': None,
        }),
        (('--tensor-parallel-size', '-tp'), {
            'type': int,
            'default': 1,
        }),
        # Workload shape and sampling options.
        (('--input-len', ), {
            'type': int,
            'default': 32,
        }),
        (('--output-len', ), {
            'type': int,
            'default': 128,
        }),
        (('--batch-size', ), {
            'type': int,
            'default': 8,
        }),
        (('--n', ), {
            'type': int,
            'default': 1,
            'help': 'Number of generated sequences per prompt.',
        }),
        (('--use-beam-search', ), {
            'action': 'store_true',
        }),
        (('--num-iters', ), {
            'type': int,
            'default': 3,
            'help': 'Number of iterations to run.',
        }),
        (('--trust-remote-code', ), {
            'action': 'store_true',
            'help': 'trust remote code from huggingface',
        }),
        (('--dtype', ), {
            'type': str,
            'default': 'auto',
            'choices':
            ['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
            'help': 'data type for model weights and activations. '
            'The "auto" option will use FP16 precision '
            'for FP32 and FP16 models, and BF16 precision '
            'for BF16 models.',
        }),
        (('--enforce-eager', ), {
            'action': 'store_true',
            'help': 'enforce eager mode and disable CUDA graph',
        }),
        (('--kv-cache-dtype', ), {
            'type': str,
            'choices': ['auto', 'fp8_e5m2'],
            'default': 'auto',
            'help': 'Data type for kv cache storage. '
            'If "auto", will use model data type.',
        }),
        # Profiling options.
        (('--profile', ), {
            'action': 'store_true',
            'help': 'profile the generation process of a single batch',
        }),
        (('--profile-result-dir', ), {
            'type': str,
            'default': None,
            'help': 'path to save the pytorch profiler output. '
            'Can be visualized with ui.perfetto.dev or Tensorboard.',
        }),
        (('--device', ), {
            'type': str,
            'default': 'cuda',
            'choices': ['cuda'],
            'help': 'device type for vLLM execution, '
            'supporting CUDA only currently.',
        }),
    ]
    for flags, options in option_table:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()
    main(args)
Add script for benchmarking serving throughput (#145) 2023-06-14 19:55:38 -07:00			`"""Benchmark the latency of processing a single batch of requests."""`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`import argparse`
			`import time`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`from pathlib import Path`
			`from typing import Optional`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
			`import numpy as np`
			`import torch`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`from tqdm import tqdm`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Change the name to vLLM (#150) 2023-06-17 03:07:40 -07:00			`from vllm import LLM, SamplingParams`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00

			`def main(args: argparse.Namespace):`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`print(args)`

			`# NOTE(woosuk): If the request cannot be processed in a single batch,`
Rename servers to engines (#152) 2023-06-17 17:25:21 +08:00			`# the engine will automatically process the request in multiple batches.`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`llm = LLM(`
			`model=args.model,`
[Tokenizer] Add an option to specify tokenizer (#284) 2023-06-28 09:46:58 -07:00			`tokenizer=args.tokenizer,`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`quantization=args.quantization,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`tensor_parallel_size=args.tensor_parallel_size,`
fix: enable trust-remote-code in api server & benchmark. (#509) 2023-07-20 08:06:15 +08:00			`trust_remote_code=args.trust_remote_code,`
Added `dtype` arg to benchmarks (#1228) 2023-10-01 00:04:03 -04:00			`dtype=args.dtype,`
Optimize model execution with CUDA graph (#1926) Co-authored-by: Chen Shen <scv119@gmail.com> Co-authored-by: Antoni Baum <antoni.baum@protonmail.com> 2023-12-16 21:12:08 -08:00			`enforce_eager=args.enforce_eager,`
Support FP8-E5M2 KV Cache (#2279) Co-authored-by: zhaoyang <zhao.yang16@zte.com.cn> Co-authored-by: Zhuohan Li <zhuohan123@gmail.com> 2024-01-29 08:43:54 +08:00			`kv_cache_dtype=args.kv_cache_dtype,`
Remove hardcoded `device="cuda" ` to support more devices (#2503) Co-authored-by: Jiang Li <jiang1.li@intel.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> 2024-02-02 07:46:39 +08:00			`device=args.device,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`sampling_params = SamplingParams(`
			`n=args.n,`
			`temperature=0.0 if args.use_beam_search else 1.0,`
			`top_p=1.0,`
			`use_beam_search=args.use_beam_search,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`ignore_eos=True,`
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`max_tokens=args.output_len,`
			`)`
Implement block copy kernel to optimize beam search (#32) 2023-04-07 17:45:07 -07:00			`print(sampling_params)`
[Minor] Fix benchmark_latency script (#2765) 2024-02-05 12:45:37 -08:00			`dummy_prompt_token_ids = np.random.randint(10000,`
			`size=(args.batch_size,`
			`args.input_len))`
			`dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`def run_to_completion(profile_dir: Optional[str] = None):`
			`if profile_dir:`
			`with torch.profiler.profile(`
			`activities=[`
			`torch.profiler.ProfilerActivity.CPU,`
			`torch.profiler.ProfilerActivity.CUDA,`
			`],`
			`on_trace_ready=torch.profiler.tensorboard_trace_handler(`
			`str(profile_dir))) as p:`
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`llm.generate(prompt_token_ids=dummy_prompt_token_ids,`
			`sampling_params=sampling_params,`
			`use_tqdm=False)`
			`print(p.key_averages())`
			`else:`
			`start_time = time.perf_counter()`
			`llm.generate(prompt_token_ids=dummy_prompt_token_ids,`
			`sampling_params=sampling_params,`
			`use_tqdm=False)`
			`end_time = time.perf_counter()`
			`latency = end_time - start_time`
			`return latency`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`print("Warming up...")`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`run_to_completion(profile_dir=None)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`if args.profile:`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`profile_dir = args.profile_result_dir`
			`if not profile_dir:`
[Experimental] Add multi-LoRA support (#1804) Co-authored-by: Chen Shen <scv119@gmail.com> Co-authored-by: Shreyas Krishnaswamy <shrekris@anyscale.com> Co-authored-by: Avnish Narayan <avnish@anyscale.com> 2024-01-24 00:26:37 +01:00			`profile_dir = Path(`
			`"."`
			`) / "vllm_benchmark_result" / f"latency_result_{time.time()}"`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`print(f"Profiling (results will be saved to '{profile_dir}')...")`
[Minor] Fix benchmark_latency script (#2765) 2024-02-05 12:45:37 -08:00			`run_to_completion(profile_dir=profile_dir)`
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`return`

Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`# Benchmark.`
			`latencies = []`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):`
Fix latency benchmark script (#2035) 2023-12-11 11:19:08 -08:00			`latencies.append(run_to_completion(profile_dir=None))`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`print(f'Avg latency: {np.mean(latencies)} seconds')`


			`if __name__ == '__main__':`
Add an option to launch cacheflow without ray (#51) 2023-04-30 15:42:17 +08:00			`parser = argparse.ArgumentParser(`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`description='Benchmark the latency of processing a single batch of '`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`'requests till completion.')`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`parser.add_argument('--model', type=str, default='facebook/opt-125m')`
[Tokenizer] Add an option to specify tokenizer (#284) 2023-06-28 09:46:58 -07:00			`parser.add_argument('--tokenizer', type=str, default=None)`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`parser.add_argument('--quantization',`
			`'-q',`
Add GPTQ support (#916) 2023-12-15 19:04:22 +08:00			`choices=['awq', 'gptq', 'squeezellm', None],`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`default=None)`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`parser.add_argument('--input-len', type=int, default=32)`
			`parser.add_argument('--output-len', type=int, default=128)`
			`parser.add_argument('--batch-size', type=int, default=8)`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`parser.add_argument('--n',`
			`type=int,`
			`default=1,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`help='Number of generated sequences per prompt.')`
Implement block copy kernel to optimize beam search (#32) 2023-04-07 17:45:07 -07:00			`parser.add_argument('--use-beam-search', action='store_true')`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`parser.add_argument('--num-iters',`
			`type=int,`
			`default=3,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`help='Number of iterations to run.')`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`parser.add_argument('--trust-remote-code',`
			`action='store_true',`
fix: enable trust-remote-code in api server & benchmark. (#509) 2023-07-20 08:06:15 +08:00			`help='trust remote code from huggingface')`
Added `dtype` arg to benchmarks (#1228) 2023-10-01 00:04:03 -04:00			`parser.add_argument(`
			`'--dtype',`
			`type=str,`
			`default='auto',`
			`choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],`
			`help='data type for model weights and activations. '`
			`'The "auto" option will use FP16 precision '`
			`'for FP32 and FP16 models, and BF16 precision '`
			`'for BF16 models.')`
Optimize model execution with CUDA graph (#1926) Co-authored-by: Chen Shen <scv119@gmail.com> Co-authored-by: Antoni Baum <antoni.baum@protonmail.com> 2023-12-16 21:12:08 -08:00			`parser.add_argument('--enforce-eager',`
			`action='store_true',`
			`help='enforce eager mode and disable CUDA graph')`
Support FP8-E5M2 KV Cache (#2279) Co-authored-by: zhaoyang <zhao.yang16@zte.com.cn> Co-authored-by: Zhuohan Li <zhuohan123@gmail.com> 2024-01-29 08:43:54 +08:00			`parser.add_argument(`
			`"--kv-cache-dtype",`
			`type=str,`
			`choices=['auto', 'fp8_e5m2'],`
			`default='auto',`
			`help=`
			`'Data type for kv cache storage. If "auto", will use model data type.')`
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`parser.add_argument(`
			`'--profile',`
			`action='store_true',`
			`help='profile the generation process of a single batch')`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`parser.add_argument(`
			`'--profile-result-dir',`
			`type=str,`
			`default=None,`
[Experimental] Add multi-LoRA support (#1804) Co-authored-by: Chen Shen <scv119@gmail.com> Co-authored-by: Shreyas Krishnaswamy <shrekris@anyscale.com> Co-authored-by: Avnish Narayan <avnish@anyscale.com> 2024-01-24 00:26:37 +01:00			`help=('path to save the pytorch profiler output. Can be visualized '`
			`'with ui.perfetto.dev or Tensorboard.'))`
Remove hardcoded `device="cuda" ` to support more devices (#2503) Co-authored-by: Jiang Li <jiang1.li@intel.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> 2024-02-02 07:46:39 +08:00			`parser.add_argument(`
			`"--device",`
			`type=str,`
			`default="cuda",`
			`choices=["cuda"],`
			`help='device type for vLLM execution, supporting CUDA only currently.')`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`args = parser.parse_args()`
			`main(args)`