[Benchmarks] Make detokenization optional in benchmark scripts (#11697)

Signed-off-by: Jeremy Arnold <Jeremy.Arnold@amd.com>
Jeremy Arnold 2025-03-07 10:09:00 -06:00 committed by GitHub
parent f7ebad2307
commit 58abe35455
4 changed files with 45 additions and 7 deletions

View File

@@ -52,6 +52,7 @@ def main(args: argparse.Namespace):
         top_p=1.0,
         ignore_eos=True,
         max_tokens=args.output_len,
+        detokenize=not args.disable_detokenize,
     )
     print(sampling_params)
     dummy_prompt_token_ids = np.random.randint(10000,
@@ -173,6 +174,12 @@ if __name__ == "__main__":
         default=None,
         help="Path to save the latency results in JSON format.",
     )
+    parser.add_argument(
+        "--disable-detokenize",
+        action="store_true",
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
+    )
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
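
Note (illustrative, not part of the diff): at the API level, passing --disable-detokenize simply flips detokenize to False on SamplingParams, so the engine returns raw token IDs without converting them back to text. A minimal sketch of that behavior; the model name and prompt are placeholders.

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model

params = SamplingParams(
    temperature=1.0,
    top_p=1.0,
    ignore_eos=True,
    max_tokens=32,
    detokenize=False,  # what the script sets when --disable-detokenize is given
)

outputs = llm.generate(["Hello, my name is"], params)
# With detokenize=False, only token IDs are populated; no output text is produced.
print(outputs[0].outputs[0].token_ids)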

View File

@@ -194,7 +194,9 @@ def main(args):
     llm = LLM(**dataclasses.asdict(engine_args))

-    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+    sampling_params = SamplingParams(temperature=0,
+                                     max_tokens=args.output_len,
+                                     detokenize=not args.disable_detokenize)

     print("Testing filtered requests")
     prompts = repeat_and_sort_requests(filtered_requests,
@@ -243,6 +245,12 @@ if __name__ == "__main__":
         "subtract this length when filtering prompts. Only used "
         "when dataset-path is not provided.",
     )
+    parser.add_argument(
+        '--disable-detokenize',
+        action='store_true',
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
+    )
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()

View File

@@ -23,7 +23,7 @@ def sample_requests(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int],
-) -> list[tuple[str, int, int]]:
+) -> list[tuple[str, int, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -71,6 +71,7 @@ def run_vllm(
     requests: list[tuple[str, int, int]],
     n: int,
     engine_args: EngineArgs,
+    disable_detokenize: bool = False,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(**dataclasses.asdict(engine_args))
@@ -95,6 +96,7 @@ def run_vllm(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=output_len,
+                detokenize=not disable_detokenize,
             ))

     start = time.perf_counter()
@@ -121,7 +123,8 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         elapsed_time = run_vllm(requests, args.n,
-                                EngineArgs.from_cli_args(args))
+                                EngineArgs.from_cli_args(args),
+                                args.disable_detokenize)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
     total_num_tokens = sum(prompt_len + output_len
@ -174,6 +177,12 @@ if __name__ == "__main__":
type=str, type=str,
default=None, default=None,
help='Path to save the throughput results in JSON format.') help='Path to save the throughput results in JSON format.')
parser.add_argument(
'--disable-detokenize',
action='store_true',
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
)
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args() args = parser.parse_args()
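
Note (illustrative, not part of the diff): a quick way to see what the flag removes from the elapsed-time measurement is to time the same batch twice, once with detokenization and once without. The model and prompts below are placeholders.

import time

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model
prompts = ["The future of AI is"] * 64  # placeholder batch

for disable_detokenize in (False, True):
    params = SamplingParams(temperature=1.0,
                            top_p=1.0,
                            ignore_eos=True,
                            max_tokens=128,
                            detokenize=not disable_detokenize)
    start = time.perf_counter()
    llm.generate(prompts, params)
    elapsed = time.perf_counter() - start
    print(f"disable_detokenize={disable_detokenize}: {elapsed:.2f}s")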

View File

@@ -168,6 +168,7 @@ def run_vllm(
     requests: list[SampleRequest],
     n: int,
     engine_args: EngineArgs,
+    disable_detokenize: bool = False,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(**dataclasses.asdict(engine_args))
@@ -194,6 +195,7 @@ def run_vllm(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=request.expected_output_len,
+                detokenize=not disable_detokenize,
             ))
     lora_requests: Optional[list[LoRARequest]] = None
     if engine_args.enable_lora:
@@ -232,6 +234,7 @@ async def run_vllm_async(
     n: int,
     engine_args: AsyncEngineArgs,
     disable_frontend_multiprocessing: bool = False,
+    disable_detokenize: bool = False,
 ) -> float:
     from vllm import SamplingParams
@@ -262,6 +265,7 @@ async def run_vllm_async(
                     top_p=1.0,
                     ignore_eos=True,
                     max_tokens=request.expected_output_len,
+                    detokenize=not disable_detokenize,
                 ))
             lora_requests.append(request.lora_request)
@@ -288,6 +292,7 @@ def run_hf(
     n: int,
     max_batch_size: int,
     trust_remote_code: bool,
+    disable_detokenize: bool = False,
 ) -> float:
     llm = AutoModelForCausalLM.from_pretrained(
         model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
@@ -327,8 +332,9 @@ def run_hf(
                 use_cache=True,
                 max_new_tokens=max_output_len,
             )
-            # Include the decoding time.
-            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+            if not disable_detokenize:
+                # Include the decoding time.
+                tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
             pbar.update(len(batch))

             # Clear the batch.
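
Note (illustrative, not part of the diff): in the HF backend the only work gated by the new flag is tokenizer.batch_decode, so the time the flag excludes can be estimated directly. The tokenizer name and the stand-in outputs below are placeholders.

import time

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")  # placeholder
# Stand-in for `llm_outputs` from model.generate(): a batch of token-ID rows.
llm_outputs = torch.randint(0, tokenizer.vocab_size, (8, 256))

start = time.perf_counter()
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
print(f"batch_decode time: {time.perf_counter() - start:.4f}s")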
@@ -440,14 +446,17 @@ def main(args: argparse.Namespace):
                     args.n,
                     AsyncEngineArgs.from_cli_args(args),
                     args.disable_frontend_multiprocessing,
+                    args.disable_detokenize,
                 ))
         else:
             elapsed_time = run_vllm(requests, args.n,
-                                    EngineArgs.from_cli_args(args))
+                                    EngineArgs.from_cli_args(args),
+                                    args.disable_detokenize)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.hf_max_batch_size, args.trust_remote_code)
+                              args.hf_max_batch_size, args.trust_remote_code,
+                              args.disable_detokenize)
     elif args.backend == "mii":
         elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                                args.output_len)
@@ -526,6 +535,11 @@ if __name__ == "__main__":
         action='store_true',
         default=False,
         help="Disable decoupled async engine frontend.")
+    parser.add_argument(
+        "--disable-detokenize",
+        action="store_true",
+        help=("Do not detokenize the response (i.e. do not include "
+              "detokenization time in the measurement)"))
     # LoRA
     parser.add_argument(
         "--lora-path",