enable --gpu-memory-utilization in benchmark_throughput.py (#3175)

Co-authored-by: zixiao <shunli.dsl@alibaba-inc.com>
Allen.Dou 2024-03-05 02:37:58 +08:00 committed by GitHub
parent 27a7b070db
commit 9cbc7e5f3b

@@ -74,6 +74,7 @@ def run_vllm(
     kv_cache_dtype: str,
     device: str,
     enable_prefix_caching: bool,
+    gpu_memory_utilization: float = 0.9,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(model=model,
@@ -84,6 +85,7 @@ def run_vllm(
               trust_remote_code=trust_remote_code,
               dtype=dtype,
               max_model_len=max_model_len,
+              gpu_memory_utilization=gpu_memory_utilization,
               enforce_eager=enforce_eager,
               kv_cache_dtype=kv_cache_dtype,
               device=device,
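
The two hunks above thread the new value through run_vllm() and into the vllm.LLM constructor, which accepts gpu_memory_utilization directly. A minimal standalone sketch of the same knob (the model name, prompt, and the 0.5 value below are illustrative, not part of this commit):

from vllm import LLM, SamplingParams

# Cap the engine at roughly half of GPU memory (weights, activations, KV cache);
# 0.9 remains the default when the argument is omitted.
llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.5)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
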
@@ -206,13 +208,12 @@ def main(args: argparse.Namespace):
                                    args.output_len)
 
     if args.backend == "vllm":
-        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
-                                args.quantization, args.tensor_parallel_size,
-                                args.seed, args.n, args.use_beam_search,
-                                args.trust_remote_code, args.dtype,
-                                args.max_model_len, args.enforce_eager,
-                                args.kv_cache_dtype, args.device,
-                                args.enable_prefix_caching)
+        elapsed_time = run_vllm(
+            requests, args.model, args.tokenizer, args.quantization,
+            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.trust_remote_code, args.dtype, args.max_model_len,
+            args.enforce_eager, args.kv_cache_dtype, args.device,
+            args.enable_prefix_caching, args.gpu_memory_utilization)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@ -287,6 +288,12 @@ if __name__ == "__main__":
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument("--enforce-eager",
action="store_true",
help="enforce eager execution")