From 58abe35455685a0dd6f1484614c663eff858aa44 Mon Sep 17 00:00:00 2001
From: Jeremy Arnold <103538711+JArnoldAMD@users.noreply.github.com>
Date: Fri, 7 Mar 2025 10:09:00 -0600
Subject: [PATCH] [Benchmarks] Make detokenization optional in benchmark scripts (#11697)

Signed-off-by: Jeremy Arnold
---
 benchmarks/benchmark_latency.py        |  7 +++++++
 benchmarks/benchmark_prefix_caching.py | 10 +++++++++-
 benchmarks/benchmark_prioritization.py | 13 +++++++++++--
 benchmarks/benchmark_throughput.py     | 22 ++++++++++++++++++----
 4 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index d7f39f50..dfd9bb1e 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -52,6 +52,7 @@ def main(args: argparse.Namespace):
         top_p=1.0,
         ignore_eos=True,
         max_tokens=args.output_len,
+        detokenize=not args.disable_detokenize,
     )
     print(sampling_params)
     dummy_prompt_token_ids = np.random.randint(10000,
@@ -173,6 +174,12 @@ if __name__ == "__main__":
         default=None,
         help="Path to save the latency results in JSON format.",
     )
+    parser.add_argument(
+        "--disable-detokenize",
+        action="store_true",
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index fba32520..4fff7a8f 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -194,7 +194,9 @@ def main(args):
 
     llm = LLM(**dataclasses.asdict(engine_args))
 
-    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+    sampling_params = SamplingParams(temperature=0,
+                                     max_tokens=args.output_len,
+                                     detokenize=not args.disable_detokenize)
 
     print("Testing filtered requests")
     prompts = repeat_and_sort_requests(filtered_requests,
@@ -243,6 +245,12 @@ if __name__ == "__main__":
         "subtract this length when filtering prompts. Only used "
         "when dataset-path is not provided.",
     )
+    parser.add_argument(
+        '--disable-detokenize',
+        action='store_true',
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
index 43b2c1b0..76fe00ed 100644
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -23,7 +23,7 @@ def sample_requests(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int],
-) -> list[tuple[str, int, int]]:
+) -> list[tuple[str, int, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
 
@@ -71,6 +71,7 @@ def run_vllm(
     requests: list[tuple[str, int, int]],
     n: int,
     engine_args: EngineArgs,
+    disable_detokenize: bool = False,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(**dataclasses.asdict(engine_args))
@@ -95,6 +96,7 @@ def run_vllm(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=output_len,
+                detokenize=not disable_detokenize,
             ))
 
     start = time.perf_counter()
@@ -121,7 +123,8 @@ def main(args: argparse.Namespace):
 
     if args.backend == "vllm":
         elapsed_time = run_vllm(requests, args.n,
-                                EngineArgs.from_cli_args(args))
+                                EngineArgs.from_cli_args(args),
+                                args.disable_detokenize)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
     total_num_tokens = sum(prompt_len + output_len
@@ -174,6 +177,12 @@ if __name__ == "__main__":
                        type=str,
                        default=None,
                        help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--disable-detokenize',
+        action='store_true',
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index d8353cf1..4ab82447 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -168,6 +168,7 @@ def run_vllm(
     requests: list[SampleRequest],
     n: int,
     engine_args: EngineArgs,
+    disable_detokenize: bool = False,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(**dataclasses.asdict(engine_args))
@@ -194,6 +195,7 @@ def run_vllm(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=request.expected_output_len,
+                detokenize=not disable_detokenize,
             ))
     lora_requests: Optional[list[LoRARequest]] = None
     if engine_args.enable_lora:
@@ -232,6 +234,7 @@ async def run_vllm_async(
     n: int,
     engine_args: AsyncEngineArgs,
     disable_frontend_multiprocessing: bool = False,
+    disable_detokenize: bool = False,
 ) -> float:
     from vllm import SamplingParams
 
@@ -262,6 +265,7 @@ async def run_vllm_async(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=request.expected_output_len,
+                detokenize=not disable_detokenize,
             ))
         lora_requests.append(request.lora_request)
 
@@ -288,6 +292,7 @@ def run_hf(
     n: int,
     max_batch_size: int,
     trust_remote_code: bool,
+    disable_detokenize: bool = False,
 ) -> float:
     llm = AutoModelForCausalLM.from_pretrained(
         model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
@@ -327,8 +332,9 @@ def run_hf(
                 use_cache=True,
                 max_new_tokens=max_output_len,
             )
-            # Include the decoding time.
-            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+            if not disable_detokenize:
+                # Include the decoding time.
+                tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
             pbar.update(len(batch))
 
             # Clear the batch.
@@ -440,14 +446,17 @@ def main(args: argparse.Namespace):
                     args.n,
                     AsyncEngineArgs.from_cli_args(args),
                     args.disable_frontend_multiprocessing,
+                    args.disable_detokenize,
                 ))
         else:
             elapsed_time = run_vllm(requests, args.n,
-                                    EngineArgs.from_cli_args(args))
+                                    EngineArgs.from_cli_args(args),
+                                    args.disable_detokenize)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.hf_max_batch_size, args.trust_remote_code)
+                              args.hf_max_batch_size, args.trust_remote_code,
+                              args.disable_detokenize)
     elif args.backend == "mii":
         elapsed_time = run_mii(requests, args.model,
                                args.tensor_parallel_size, args.output_len)
@@ -526,6 +535,11 @@ if __name__ == "__main__":
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
+    parser.add_argument(
+        "--disable-detokenize",
+        action="store_true",
+        help=("Do not detokenize the response (i.e. do not include "
+              "detokenization time in the measurement)"))
    # LoRA
    parser.add_argument(
        "--lora-path",
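
The snippet below is an illustrative sketch, not part of the patch, showing how the new --disable-detokenize flag maps onto vLLM's SamplingParams in the benchmark scripts above; the model name, prompt, and token count are arbitrary placeholders.

    # Hypothetical standalone example mirroring the benchmarks' pattern:
    # the CLI flag simply flips SamplingParams(detokenize=...).
    from vllm import LLM, SamplingParams

    disable_detokenize = True  # corresponds to passing --disable-detokenize

    llm = LLM(model="facebook/opt-125m")  # placeholder model
    sampling_params = SamplingParams(
        temperature=1.0,
        top_p=1.0,
        ignore_eos=True,
        max_tokens=16,
        # When detokenize is False, generation still produces token IDs but
        # skips converting them back to text, so detokenization time is
        # excluded from the benchmark measurement.
        detokenize=not disable_detokenize,
    )
    outputs = llm.generate(["Benchmark prompt"], sampling_params)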