[Misc] Add args for selecting distributed executor to benchmarks (#5335)
This commit is contained in:
parent
e69ded7d1c
commit
b3376e5c76
@ -36,7 +36,8 @@ def main(args: argparse.Namespace):
|
|||||||
enable_chunked_prefill=args.enable_chunked_prefill,
|
enable_chunked_prefill=args.enable_chunked_prefill,
|
||||||
download_dir=args.download_dir,
|
download_dir=args.download_dir,
|
||||||
block_size=args.block_size,
|
block_size=args.block_size,
|
||||||
gpu_memory_utilization=args.gpu_memory_utilization)
|
gpu_memory_utilization=args.gpu_memory_utilization,
|
||||||
|
distributed_executor_backend=args.distributed_executor_backend)
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
n=args.n,
|
n=args.n,
|
||||||
@ -221,5 +222,12 @@ if __name__ == '__main__':
|
|||||||
help='the fraction of GPU memory to be used for '
|
help='the fraction of GPU memory to be used for '
|
||||||
'the model executor, which can range from 0 to 1.'
|
'the model executor, which can range from 0 to 1.'
|
||||||
'If unspecified, will use the default value of 0.9.')
|
'If unspecified, will use the default value of 0.9.')
|
||||||
|
parser.add_argument(
|
||||||
|
'--distributed-executor-backend',
|
||||||
|
choices=['ray', 'mp'],
|
||||||
|
default=None,
|
||||||
|
help='Backend to use for distributed serving. When more than 1 GPU '
|
||||||
|
'is used, will be automatically set to "ray" if installed '
|
||||||
|
'or "mp" (multiprocessing) otherwise.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
@ -78,6 +78,7 @@ def run_vllm(
|
|||||||
enable_prefix_caching: bool,
|
enable_prefix_caching: bool,
|
||||||
enable_chunked_prefill: bool,
|
enable_chunked_prefill: bool,
|
||||||
max_num_batched_tokens: int,
|
max_num_batched_tokens: int,
|
||||||
|
distributed_executor_backend: Optional[str],
|
||||||
gpu_memory_utilization: float = 0.9,
|
gpu_memory_utilization: float = 0.9,
|
||||||
download_dir: Optional[str] = None,
|
download_dir: Optional[str] = None,
|
||||||
) -> float:
|
) -> float:
|
||||||
@ -100,6 +101,7 @@ def run_vllm(
|
|||||||
download_dir=download_dir,
|
download_dir=download_dir,
|
||||||
enable_chunked_prefill=enable_chunked_prefill,
|
enable_chunked_prefill=enable_chunked_prefill,
|
||||||
max_num_batched_tokens=max_num_batched_tokens,
|
max_num_batched_tokens=max_num_batched_tokens,
|
||||||
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add the requests to the engine.
|
# Add the requests to the engine.
|
||||||
@ -225,8 +227,8 @@ def main(args: argparse.Namespace):
|
|||||||
args.enforce_eager, args.kv_cache_dtype,
|
args.enforce_eager, args.kv_cache_dtype,
|
||||||
args.quantization_param_path, args.device,
|
args.quantization_param_path, args.device,
|
||||||
args.enable_prefix_caching, args.enable_chunked_prefill,
|
args.enable_prefix_caching, args.enable_chunked_prefill,
|
||||||
args.max_num_batched_tokens, args.gpu_memory_utilization,
|
args.max_num_batched_tokens, args.distributed_executor_backend,
|
||||||
args.download_dir)
|
args.gpu_memory_utilization, args.download_dir)
|
||||||
elif args.backend == "hf":
|
elif args.backend == "hf":
|
||||||
assert args.tensor_parallel_size == 1
|
assert args.tensor_parallel_size == 1
|
||||||
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
||||||
@ -368,6 +370,13 @@ if __name__ == "__main__":
|
|||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help='Path to save the throughput results in JSON format.')
|
help='Path to save the throughput results in JSON format.')
|
||||||
|
parser.add_argument(
|
||||||
|
'--distributed-executor-backend',
|
||||||
|
choices=['ray', 'mp'],
|
||||||
|
default=None,
|
||||||
|
help='Backend to use for distributed serving. When more than 1 GPU '
|
||||||
|
'is used, will be automatically set to "ray" if installed '
|
||||||
|
'or "mp" (multiprocessing) otherwise.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.tokenizer is None:
|
if args.tokenizer is None:
|
||||||
args.tokenizer = args.model
|
args.tokenizer = args.model
|
||||||
|
Loading…
x
Reference in New Issue
Block a user