[Misc] Add args for selecting distributed executor to benchmarks (#5335)

This commit is contained in:
Benjamin Kitor 2024-06-07 18:20:16 -07:00 committed by GitHub
parent e69ded7d1c
commit b3376e5c76
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 20 additions and 3 deletions

View File

@@ -36,7 +36,8 @@ def main(args: argparse.Namespace):
         enable_chunked_prefill=args.enable_chunked_prefill,
         download_dir=args.download_dir,
         block_size=args.block_size,
-        gpu_memory_utilization=args.gpu_memory_utilization)
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        distributed_executor_backend=args.distributed_executor_backend)
     sampling_params = SamplingParams(
         n=args.n,
@@ -221,5 +222,12 @@ if __name__ == '__main__':
         help='the fraction of GPU memory to be used for '
         'the model executor, which can range from 0 to 1.'
         'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
     args = parser.parse_args()
     main(args)

View File

@@ -78,6 +78,7 @@ def run_vllm(
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     download_dir: Optional[str] = None,
 ) -> float:
@@ -100,6 +101,7 @@ def run_vllm(
         download_dir=download_dir,
         enable_chunked_prefill=enable_chunked_prefill,
         max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
     )
     # Add the requests to the engine.
@@ -225,8 +227,8 @@ def main(args: argparse.Namespace):
             args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.gpu_memory_utilization,
-            args.download_dir)
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.download_dir)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -368,6 +370,13 @@ if __name__ == "__main__":
         type=str,
         default=None,
         help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model