vllm/benchmarks/benchmark_latency.py

"""Benchmark the latency of processing a single batch of requests."""
import argparse
import dataclasses
import json
import time
from pathlib import Path
from typing import List, Optional

import numpy as np
import torch
from tqdm import tqdm

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.utils import FlexibleArgumentParser


def main(args: argparse.Namespace) -> None:
    """Measure the latency of generating one fixed batch of dummy prompts.

    An ``LLM`` is built from the engine options found in ``args``. The
    workload is ``args.batch_size`` prompts, each made of ``args.input_len``
    random token IDs drawn from ``[0, 10000)``, generated with
    ``ignore_eos=True`` and ``max_tokens=args.output_len``.

    After ``args.num_iters_warmup`` warmup runs (their timings are
    discarded), one of two modes runs:

    * ``args.profile`` is set: a single run is executed under the PyTorch
      profiler, the trace goes to ``args.profile_result_dir`` (or to a
      timestamped directory under ``./vllm_benchmark_result``), a summary
      table is printed, and the function returns without benchmarking.
    * otherwise: ``args.num_iters`` timed runs are executed, the mean and
      the 10/25/50/75/90/99th percentile latencies are printed, and the
      results are written to ``args.output_json`` when that path is given.

    Args:
        args: Parsed command-line namespace holding the benchmark options
            read below plus the engine options consumed by
            ``EngineArgs.from_cli_args``.
    """
    print(args)

    # The CLI still accepts --use-beam-search, but nothing in this function
    # reads it: the SamplingParams below are built without it and
    # llm.generate() is always used. Say so explicitly instead of silently
    # benchmarking something other than what the user asked for.
    # getattr keeps main() usable with namespaces that lack the attribute.
    if getattr(args, "use_beam_search", False):
        print("WARNING: --use-beam-search is ignored by this benchmark; "
              "the sampling parameters printed below are used instead.")

    engine_args = EngineArgs.from_cli_args(args)

    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(**dataclasses.asdict(engine_args))

    sampling_params = SamplingParams(
        n=args.n,
        temperature=1.0,
        top_p=1.0,
        ignore_eos=True,
        max_tokens=args.output_len,
    )
    print(sampling_params)

    # One row of random token IDs per prompt; the same batch is reused for
    # every warmup, profiling and benchmark iteration.
    dummy_prompt_token_ids = np.random.randint(10000,
                                               size=(args.batch_size,
                                                     args.input_len))
    dummy_prompts: List[PromptType] = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]

    def run_to_completion(profile_dir: Optional[str] = None
                          ) -> Optional[float]:
        """Generate the dummy batch once.

        Args:
            profile_dir: When truthy, the run is wrapped in the PyTorch
                profiler and the trace is written to this directory. The
                value is passed through ``str()``, so a ``Path`` works too.

        Returns:
            The wall-clock latency in seconds for a normal run, or ``None``
            for a profiled run (which is not timed).
        """
        if profile_dir:
            with torch.profiler.profile(
                    activities=[
                        torch.profiler.ProfilerActivity.CPU,
                        torch.profiler.ProfilerActivity.CUDA,
                    ],
                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
                        str(profile_dir))) as p:
                llm.generate(dummy_prompts,
                             sampling_params=sampling_params,
                             use_tqdm=False)
            print(p.key_averages().table(sort_by="self_cuda_time_total"))
            return None

        start_time = time.perf_counter()
        llm.generate(dummy_prompts,
                     sampling_params=sampling_params,
                     use_tqdm=False)
        end_time = time.perf_counter()
        return end_time - start_time

    print("Warming up...")
    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
        run_to_completion(profile_dir=None)

    if args.profile:
        profile_dir = args.profile_result_dir
        if not profile_dir:
            # No directory given: fall back to a timestamped one so that
            # repeated profiling runs do not overwrite each other.
            profile_dir = Path(
                "."
            ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
        print(f"Profiling (results will be saved to '{profile_dir}')...")
        run_to_completion(profile_dir=profile_dir)
        return

    # Benchmark.
    latencies = []
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
        latencies.append(run_to_completion(profile_dir=None))
    latencies = np.array(latencies)
    percentages = [10, 25, 50, 75, 90, 99]
    percentiles = np.percentile(latencies, percentages)
    # Computed once and reused for both the console and the JSON output.
    avg_latency = np.mean(latencies)
    print(f'Avg latency: {avg_latency} seconds')
    for percentage, percentile in zip(percentages, percentiles):
        print(f'{percentage}% percentile latency: {percentile} seconds')

    # Output JSON results if specified
    if args.output_json:
        results = {
            # Plain Python float so the payload never depends on how json
            # treats NumPy scalar types.
            "avg_latency": float(avg_latency),
            "latencies": latencies.tolist(),
            "percentiles": dict(zip(percentages, percentiles.tolist())),
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)


if __name__ == '__main__':
    # Script entry point: register the benchmark-specific options, let
    # EngineArgs append the engine options, then hand the parsed namespace
    # to main().
    cli_parser = FlexibleArgumentParser(
        description=('Benchmark the latency of processing a single batch of '
                     'requests till completion.'))

    # Workload shape. These three integer options carry no help text.
    for flag, default_value in (
        ('--input-len', 32),
        ('--output-len', 128),
        ('--batch-size', 8),
    ):
        cli_parser.add_argument(flag, type=int, default=default_value)

    # Sampling options.
    cli_parser.add_argument(
        '--n',
        type=int,
        default=1,
        help='Number of generated sequences per prompt.',
    )
    cli_parser.add_argument('--use-beam-search', action='store_true')

    # Iteration counts.
    cli_parser.add_argument(
        '--num-iters-warmup',
        type=int,
        default=10,
        help='Number of iterations to run for warmup.',
    )
    cli_parser.add_argument(
        '--num-iters',
        type=int,
        default=30,
        help='Number of iterations to run.',
    )

    # Profiling mode and the location of its trace output.
    cli_parser.add_argument(
        '--profile',
        action='store_true',
        help='profile the generation process of a single batch',
    )
    cli_parser.add_argument(
        '--profile-result-dir',
        type=str,
        default=None,
        help=('path to save the pytorch profiler output. Can be visualized '
              'with ui.perfetto.dev or Tensorboard.'),
    )

    # Optional machine-readable results file.
    cli_parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the latency results in JSON format.',
    )

    # Engine options are added last, after the script's own options.
    cli_parser = EngineArgs.add_cli_args(cli_parser)
    main(cli_parser.parse_args())
Add script for benchmarking serving throughput (#145) 2023-06-14 19:55:38 -07:00			`"""Benchmark the latency of processing a single batch of requests."""`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`import argparse`
[Misc] Make benchmarks use EngineArgs (#9529) 2024-10-22 17:40:38 -05:00			`import dataclasses`
Add JSON output support for benchmark_latency and benchmark_throughput (#4848) 2024-05-16 10:02:56 -07:00			`import json`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`import time`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`from pathlib import Path`
[Core] Consolidate prompt arguments to LLM engines (#4328) Co-authored-by: Roger Wang <ywang@roblox.com> 2024-05-29 04:29:31 +08:00			`from typing import List, Optional`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
			`import numpy as np`
			`import torch`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`from tqdm import tqdm`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Change the name to vLLM (#150) 2023-06-17 03:07:40 -07:00			`from vllm import LLM, SamplingParams`
[Misc] Make benchmarks use EngineArgs (#9529) 2024-10-22 17:40:38 -05:00			`from vllm.engine.arg_utils import EngineArgs`
[Core] rename`PromptInputs` and `inputs` (#8876) 2024-09-27 11:35:15 +08:00			`from vllm.inputs import PromptType`
[Frontend] Add FlexibleArgumentParser to support both underscore and dash in names (#5718) 2024-06-20 19:00:13 -04:00			`from vllm.utils import FlexibleArgumentParser`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00

			`def main(args: argparse.Namespace):`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`print(args)`

[Misc] Make benchmarks use EngineArgs (#9529) 2024-10-22 17:40:38 -05:00			`engine_args = EngineArgs.from_cli_args(args)`

Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`# NOTE(woosuk): If the request cannot be processed in a single batch,`
Rename servers to engines (#152) 2023-06-17 17:25:21 +08:00			`# the engine will automatically process the request in multiple batches.`
[Misc] Make benchmarks use EngineArgs (#9529) 2024-10-22 17:40:38 -05:00			`llm = LLM(**dataclasses.asdict(engine_args))`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`sampling_params = SamplingParams(`
			`n=args.n,`
[core] remove beam search from the core (#9105) 2024-10-06 22:47:04 -07:00			`temperature=1.0,`
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`top_p=1.0,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`ignore_eos=True,`
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`max_tokens=args.output_len,`
			`)`
Implement block copy kernel to optimize beam search (#32) 2023-04-07 17:45:07 -07:00			`print(sampling_params)`
[Minor] Fix benchmark_latency script (#2765) 2024-02-05 12:45:37 -08:00			`dummy_prompt_token_ids = np.random.randint(10000,`
			`size=(args.batch_size,`
			`args.input_len))`
[Core] rename`PromptInputs` and `inputs` (#8876) 2024-09-27 11:35:15 +08:00			`dummy_prompts: List[PromptType] = [{`
[Core] Consolidate prompt arguments to LLM engines (#4328) Co-authored-by: Roger Wang <ywang@roblox.com> 2024-05-29 04:29:31 +08:00			`"prompt_token_ids": batch`
			`} for batch in dummy_prompt_token_ids.tolist()]`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`def run_to_completion(profile_dir: Optional[str] = None):`
			`if profile_dir:`
			`with torch.profiler.profile(`
			`activities=[`
			`torch.profiler.ProfilerActivity.CPU,`
			`torch.profiler.ProfilerActivity.CUDA,`
			`],`
			`on_trace_ready=torch.profiler.tensorboard_trace_handler(`
			`str(profile_dir))) as p:`
[Core] rename`PromptInputs` and `inputs` (#8876) 2024-09-27 11:35:15 +08:00			`llm.generate(dummy_prompts,`
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`sampling_params=sampling_params,`
			`use_tqdm=False)`
[Misc] sort torch profiler table by kernel timing (#11813) 2025-01-07 20:57:04 -06:00			`print(p.key_averages().table(sort_by="self_cuda_time_total"))`
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`else:`
			`start_time = time.perf_counter()`
[Core] rename`PromptInputs` and `inputs` (#8876) 2024-09-27 11:35:15 +08:00			`llm.generate(dummy_prompts,`
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`sampling_params=sampling_params,`
			`use_tqdm=False)`
			`end_time = time.perf_counter()`
			`latency = end_time - start_time`
			`return latency`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`print("Warming up...")`
[CI/Benchmark] add more iteration and use median for robust latency benchmark (#3889) 2024-04-06 14:32:30 -07:00			`for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):`
			`run_to_completion(profile_dir=None)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`if args.profile:`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`profile_dir = args.profile_result_dir`
			`if not profile_dir:`
[Experimental] Add multi-LoRA support (#1804) Co-authored-by: Chen Shen <scv119@gmail.com> Co-authored-by: Shreyas Krishnaswamy <shrekris@anyscale.com> Co-authored-by: Avnish Narayan <avnish@anyscale.com> 2024-01-24 00:26:37 +01:00			`profile_dir = Path(`
			`"."`
			`) / "vllm_benchmark_result" / f"latency_result_{time.time()}"`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`print(f"Profiling (results will be saved to '{profile_dir}')...")`
[Minor] Fix benchmark_latency script (#2765) 2024-02-05 12:45:37 -08:00			`run_to_completion(profile_dir=profile_dir)`
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`return`

Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`# Benchmark.`
			`latencies = []`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):`
Fix latency benchmark script (#2035) 2023-12-11 11:19:08 -08:00			`latencies.append(run_to_completion(profile_dir=None))`
[CI/Benchmark] add more iteration and use median for robust latency benchmark (#3889) 2024-04-06 14:32:30 -07:00			`latencies = np.array(latencies)`
[CI] the readability of benchmarking and prepare for dashboard (#5571) [CI] Improve the readability of performance benchmarking results and prepare for upcoming performance dashboard (#5571) 2024-06-17 11:41:08 -07:00			`percentages = [10, 25, 50, 75, 90, 99]`
[CI/Benchmark] add more iteration and use median for robust latency benchmark (#3889) 2024-04-06 14:32:30 -07:00			`percentiles = np.percentile(latencies, percentages)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`print(f'Avg latency: {np.mean(latencies)} seconds')`
[CI/Benchmark] add more iteration and use median for robust latency benchmark (#3889) 2024-04-06 14:32:30 -07:00			`for percentage, percentile in zip(percentages, percentiles):`
			`print(f'{percentage}% percentile latency: {percentile} seconds')`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Add JSON output support for benchmark_latency and benchmark_throughput (#4848) 2024-05-16 10:02:56 -07:00			`# Output JSON results if specified`
			`if args.output_json:`
			`results = {`
			`"avg_latency": np.mean(latencies),`
			`"latencies": latencies.tolist(),`
			`"percentiles": dict(zip(percentages, percentiles.tolist())),`
			`}`
			`with open(args.output_json, "w") as f:`
			`json.dump(results, f, indent=4)`

Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
			`if __name__ == '__main__':`
[Frontend] Add FlexibleArgumentParser to support both underscore and dash in names (#5718) 2024-06-20 19:00:13 -04:00			`parser = FlexibleArgumentParser(`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`description='Benchmark the latency of processing a single batch of '`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`'requests till completion.')`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`parser.add_argument('--input-len', type=int, default=32)`
			`parser.add_argument('--output-len', type=int, default=128)`
			`parser.add_argument('--batch-size', type=int, default=8)`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`parser.add_argument('--n',`
			`type=int,`
			`default=1,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`help='Number of generated sequences per prompt.')`
Implement block copy kernel to optimize beam search (#32) 2023-04-07 17:45:07 -07:00			`parser.add_argument('--use-beam-search', action='store_true')`
[CI/Benchmark] add more iteration and use median for robust latency benchmark (#3889) 2024-04-06 14:32:30 -07:00			`parser.add_argument('--num-iters-warmup',`
			`type=int,`
			`default=10,`
			`help='Number of iterations to run for warmup.')`
Implement AWQ quantization support for LLaMA (#1032) Co-authored-by: Robert Irvine <robert@seamlessml.com> Co-authored-by: root <rirv938@gmail.com> Co-authored-by: Casper <casperbh.96@gmail.com> Co-authored-by: julian-q <julianhquevedo@gmail.com> 2023-09-16 00:03:37 -07:00			`parser.add_argument('--num-iters',`
			`type=int,`
[CI/Benchmark] add more iteration and use median for robust latency benchmark (#3889) 2024-04-06 14:32:30 -07:00			`default=30,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`help='Number of iterations to run.')`
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`parser.add_argument(`
			`'--profile',`
			`action='store_true',`
			`help='profile the generation process of a single batch')`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`parser.add_argument(`
			`'--profile-result-dir',`
			`type=str,`
			`default=None,`
[Experimental] Add multi-LoRA support (#1804) Co-authored-by: Chen Shen <scv119@gmail.com> Co-authored-by: Shreyas Krishnaswamy <shrekris@anyscale.com> Co-authored-by: Avnish Narayan <avnish@anyscale.com> 2024-01-24 00:26:37 +01:00			`help=('path to save the pytorch profiler output. Can be visualized '`
			`'with ui.perfetto.dev or Tensorboard.'))`
Add JSON output support for benchmark_latency and benchmark_throughput (#4848) 2024-05-16 10:02:56 -07:00			`parser.add_argument(`
			`'--output-json',`
			`type=str,`
			`default=None,`
			`help='Path to save the latency results in JSON format.')`
[Misc] Make benchmarks use EngineArgs (#9529) 2024-10-22 17:40:38 -05:00
			`parser = EngineArgs.add_cli_args(parser)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`args = parser.parse_args()`
			`main(args)`