[Misc] Add args for selecting distributed executor to benchmarks (#5335)
parent e69ded7d1c
commit b3376e5c76
@@ -36,7 +36,8 @@ def main(args: argparse.Namespace):
         enable_chunked_prefill=args.enable_chunked_prefill,
         download_dir=args.download_dir,
         block_size=args.block_size,
-        gpu_memory_utilization=args.gpu_memory_utilization)
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        distributed_executor_backend=args.distributed_executor_backend)
 
     sampling_params = SamplingParams(
         n=args.n,
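For orientation, a minimal sketch of the call this hunk now produces. It assumes the engine at this commit already accepts distributed_executor_backend as a constructor argument; the model name and sampling values are illustrative, not from the commit:

    # Hedged sketch, not part of the commit: the latency benchmark's
    # LLM construction once the new flag is threaded through.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="facebook/opt-125m",            # illustrative model choice
        tensor_parallel_size=2,
        gpu_memory_utilization=0.9,
        distributed_executor_backend="mp")    # "ray", "mp", or None (auto)
    sampling_params = SamplingParams(n=1, max_tokens=128)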
@@ -221,5 +222,12 @@ if __name__ == '__main__':
                         help='the fraction of GPU memory to be used for '
                         'the model executor, which can range from 0 to 1.'
                         'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
     args = parser.parse_args()
     main(args)
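With the flag registered, the latency benchmark can pin the executor explicitly instead of relying on auto-selection. A usage sketch; the script path benchmarks/benchmark_latency.py is an assumption, since this page does not show file names:

    python benchmarks/benchmark_latency.py \
        --model facebook/opt-125m \
        --tensor-parallel-size 2 \
        --distributed-executor-backend mp

Leaving the flag off keeps the default of None, which lets the engine fall back to Ray if installed and multiprocessing otherwise.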
@@ -78,6 +78,7 @@ def run_vllm(
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     download_dir: Optional[str] = None,
 ) -> float:
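Note that the new parameter is added without a default and ahead of the defaulted ones, so every positional caller of run_vllm must be updated in the same commit; the hunk at line 225 below does exactly that. A runnable stub sketching the signature shape (the body is hypothetical, not the real function):

    from typing import Optional

    def run_vllm_stub(
        max_num_batched_tokens: int,
        distributed_executor_backend: Optional[str],  # new, required
        gpu_memory_utilization: float = 0.9,
        download_dir: Optional[str] = None,
    ) -> float:
        # The real function builds an LLM and times generation; stubbed here.
        return 0.0

    # None defers the backend choice to the engine.
    elapsed = run_vllm_stub(4096, None)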
@@ -100,6 +101,7 @@ def run_vllm(
         download_dir=download_dir,
         enable_chunked_prefill=enable_chunked_prefill,
         max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
     )
 
     # Add the requests to the engine.
@@ -225,8 +227,8 @@ def main(args: argparse.Namespace):
                                 args.enforce_eager, args.kv_cache_dtype,
                                 args.quantization_param_path, args.device,
                                 args.enable_prefix_caching, args.enable_chunked_prefill,
-                                args.max_num_batched_tokens, args.gpu_memory_utilization,
-                                args.download_dir)
+                                args.max_num_batched_tokens, args.distributed_executor_backend,
+                                args.gpu_memory_utilization, args.download_dir)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
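A side note on this call site: the arguments travel positionally, so the new value must be spliced in at exactly the matching slot. A toy illustration (hypothetical names, not from the commit) of why keyword calls are more robust to this kind of signature change:

    def timed_run(batched_tokens, backend, gpu_util=0.9):
        return backend, gpu_util

    print(timed_run(4096, "mp"))                         # positional: order-sensitive
    print(timed_run(4096, backend="ray", gpu_util=0.8))  # keywords: order-free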
@@ -368,6 +370,13 @@ if __name__ == "__main__":
                         type=str,
                         default=None,
                         help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
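The throughput benchmark gains the identical flag, so the two executor backends can be compared directly on the same workload. A usage sketch; the script path benchmarks/benchmark_throughput.py and the synthetic-length flags are assumptions, as above:

    python benchmarks/benchmark_throughput.py --model facebook/opt-125m \
        --input-len 256 --output-len 256 \
        --tensor-parallel-size 2 --distributed-executor-backend ray
    python benchmarks/benchmark_throughput.py --model facebook/opt-125m \
        --input-len 256 --output-len 256 \
        --tensor-parallel-size 2 --distributed-executor-backend mp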