[Benchmarks] Make detokenization optional in benchmark scripts (#11697)
Signed-off-by: Jeremy Arnold <Jeremy.Arnold@amd.com>
Parent: f7ebad2307
Commit: 58abe35455
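The change follows one pattern across all of the touched benchmark scripts: add a --disable-detokenize flag and forward it into SamplingParams as detokenize=not args.disable_detokenize, so detokenization time can be excluded from the measurement. A minimal sketch of that wiring, assuming vLLM's LLM/SamplingParams API; the model name and prompt below are placeholders, not part of this commit:

    # Minimal sketch of the flag-to-SamplingParams wiring used in this commit.
    # Model name and prompt are placeholders for illustration only.
    import argparse

    from vllm import LLM, SamplingParams

    parser = argparse.ArgumentParser()
    parser.add_argument("--disable-detokenize", action="store_true",
                        help="Skip detokenization so it is not timed.")
    args = parser.parse_args()

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=128,
        detokenize=not args.disable_detokenize,  # False when the flag is set
    )
    llm = LLM(model="facebook/opt-125m")  # placeholder model
    outputs = llm.generate(["Hello, my name is"], sampling_params)
    # With detokenize=False the engine is expected to skip converting token IDs
    # back to text, so only token IDs would be present in the outputs.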
@@ -52,6 +52,7 @@ def main(args: argparse.Namespace):
         top_p=1.0,
         ignore_eos=True,
         max_tokens=args.output_len,
+        detokenize=not args.disable_detokenize,
     )
     print(sampling_params)
     dummy_prompt_token_ids = np.random.randint(10000,
@@ -173,6 +174,12 @@ if __name__ == "__main__":
         default=None,
         help="Path to save the latency results in JSON format.",
     )
+    parser.add_argument(
+        "--disable-detokenize",
+        action="store_true",
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
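As a side note, a small check of the flag semantics introduced by the hunk above (only the argument name is taken from the diff; the rest is illustrative):

    # Quick illustration of the flag's default-off behaviour (store_true):
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--disable-detokenize", action="store_true")

    assert parser.parse_args([]).disable_detokenize is False
    assert parser.parse_args(["--disable-detokenize"]).disable_detokenize is True
    # so detokenize=not args.disable_detokenize keeps today's behaviour by default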
@@ -194,7 +194,9 @@ def main(args):
 
     llm = LLM(**dataclasses.asdict(engine_args))
 
-    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+    sampling_params = SamplingParams(temperature=0,
+                                     max_tokens=args.output_len,
+                                     detokenize=not args.disable_detokenize)
 
     print("Testing filtered requests")
     prompts = repeat_and_sort_requests(filtered_requests,
@@ -243,6 +245,12 @@ if __name__ == "__main__":
         "subtract this length when filtering prompts. Only used "
         "when dataset-path is not provided.",
     )
+    parser.add_argument(
+        '--disable-detokenize',
+        action='store_true',
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
@@ -23,7 +23,7 @@ def sample_requests(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int],
-) -> list[tuple[str, int, int]]:
+) -> list[tuple[str, int, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
 
@@ -71,6 +71,7 @@ def run_vllm(
     requests: list[tuple[str, int, int]],
     n: int,
     engine_args: EngineArgs,
+    disable_detokenize: bool = False,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(**dataclasses.asdict(engine_args))
@@ -95,6 +96,7 @@ def run_vllm(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=output_len,
+                detokenize=not disable_detokenize,
             ))
 
     start = time.perf_counter()
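To see what the new parameter buys in practice, one rough, hypothetical way to compare the two settings outside the benchmark scripts (model name and sizes are arbitrary, not from this commit):

    # Rough sketch: time the same batch with and without detokenization to
    # estimate the detokenization share of end-to-end time. Model name and
    # batch/token sizes are arbitrary placeholders.
    import time

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")  # placeholder
    prompts = ["Benchmark prompt"] * 32

    for disable_detokenize in (False, True):
        params = SamplingParams(temperature=1.0, top_p=1.0, ignore_eos=True,
                                max_tokens=128,
                                detokenize=not disable_detokenize)
        start = time.perf_counter()
        llm.generate(prompts, params)
        print(f"disable_detokenize={disable_detokenize}: "
              f"{time.perf_counter() - start:.3f}s")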
@@ -121,7 +123,8 @@ def main(args: argparse.Namespace):
 
     if args.backend == "vllm":
         elapsed_time = run_vllm(requests, args.n,
-                                EngineArgs.from_cli_args(args))
+                                EngineArgs.from_cli_args(args),
+                                args.disable_detokenize)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
     total_num_tokens = sum(prompt_len + output_len
@@ -174,6 +177,12 @@ if __name__ == "__main__":
         type=str,
         default=None,
         help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--disable-detokenize',
+        action='store_true',
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
@@ -168,6 +168,7 @@ def run_vllm(
     requests: list[SampleRequest],
     n: int,
     engine_args: EngineArgs,
+    disable_detokenize: bool = False,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(**dataclasses.asdict(engine_args))
@@ -194,6 +195,7 @@ def run_vllm(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=request.expected_output_len,
+                detokenize=not disable_detokenize,
             ))
     lora_requests: Optional[list[LoRARequest]] = None
     if engine_args.enable_lora:
@@ -232,6 +234,7 @@ async def run_vllm_async(
     n: int,
     engine_args: AsyncEngineArgs,
     disable_frontend_multiprocessing: bool = False,
+    disable_detokenize: bool = False,
 ) -> float:
     from vllm import SamplingParams
 
@@ -262,6 +265,7 @@ async def run_vllm_async(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=request.expected_output_len,
+                detokenize=not disable_detokenize,
             ))
         lora_requests.append(request.lora_request)
 
@@ -288,6 +292,7 @@ def run_hf(
     n: int,
     max_batch_size: int,
     trust_remote_code: bool,
+    disable_detokenize: bool = False,
 ) -> float:
     llm = AutoModelForCausalLM.from_pretrained(
         model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
@@ -327,8 +332,9 @@ def run_hf(
                 use_cache=True,
                 max_new_tokens=max_output_len,
             )
-            # Include the decoding time.
-            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+            if not disable_detokenize:
+                # Include the decoding time.
+                tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
             pbar.update(len(batch))
 
             # Clear the batch.
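For the HF backend, the cost being gated above is tokenizer.batch_decode. A rough standalone sketch of measuring just that step (the tokenizer name and the fake output tensor are placeholders, not from this commit):

    # Rough sketch of timing the decode step the new guard can skip. The
    # tokenizer name is a placeholder; llm_outputs stands in for the tensor
    # returned by model.generate() in the hunk above.
    import time

    import torch
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")  # placeholder
    llm_outputs = torch.randint(0, tokenizer.vocab_size, (8, 256))  # fake generations

    start = time.perf_counter()
    tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
    print(f"batch_decode took {time.perf_counter() - start:.4f}s")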
@@ -440,14 +446,17 @@ def main(args: argparse.Namespace):
                     args.n,
                     AsyncEngineArgs.from_cli_args(args),
                     args.disable_frontend_multiprocessing,
+                    args.disable_detokenize,
                 ))
         else:
             elapsed_time = run_vllm(requests, args.n,
-                                    EngineArgs.from_cli_args(args))
+                                    EngineArgs.from_cli_args(args),
+                                    args.disable_detokenize)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.hf_max_batch_size, args.trust_remote_code)
+                              args.hf_max_batch_size, args.trust_remote_code,
+                              args.disable_detokenize)
     elif args.backend == "mii":
         elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                                args.output_len)
@@ -526,6 +535,11 @@ if __name__ == "__main__":
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
+    parser.add_argument(
+        "--disable-detokenize",
+        action="store_true",
+        help=("Do not detokenize the response (i.e. do not include "
+              "detokenization time in the measurement)"))
     # LoRA
     parser.add_argument(
         "--lora-path",