benchmark_serving support --served-model-name param (#12109)

Signed-off-by: zibai <zibai.gj@alibaba-inc.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2025-01-19 17:59:56 +08:00 · 2025-01-19 17:59:56 +08:00 · 936db119ed
commit 936db119ed
parent e66faf4809
2 changed files with 19 additions and 3 deletions
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -22,6 +22,7 @@ class RequestFuncInput:
    prompt_len: int
    output_len: int
    model: str
+    model_name: Optional[str] = None
    best_of: int = 1
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
@@ -78,7 +79,7 @@ async def async_request_tgi(
                            continue
                        chunk_bytes = chunk_bytes.decode("utf-8")

-                        #NOTE: Sometimes TGI returns a ping response without
+                        # NOTE: Sometimes TGI returns a ping response without
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
@@ -235,7 +236,8 @@ async def async_request_openai_completions(

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
-            "model": request_func_input.model,
+            "model": request_func_input.model_name \
+                if request_func_input.model_name else request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "best_of": request_func_input.best_of,
@@ -328,7 +330,8 @@ async def async_request_openai_chat_completions(
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
-            "model": request_func_input.model,
+            "model": request_func_input.model_name \
+                if request_func_input.model_name else request_func_input.model,
            "messages": [
                {
                    "role": "user",
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -525,6 +525,7 @@ async def benchmark(
    api_url: str,
    base_url: str,
    model_id: str,
+    model_name: str,
    tokenizer: PreTrainedTokenizerBase,
    input_requests: List[Tuple[str, int, int]],
    logprobs: Optional[int],
@@ -553,6 +554,7 @@ async def benchmark(
            "Multi-modal content is only supported on 'openai-chat' backend.")
    test_input = RequestFuncInput(
        model=model_id,
+        model_name=model_name,
        prompt=test_prompt,
        api_url=api_url,
        prompt_len=test_prompt_len,
@@ -573,6 +575,7 @@ async def benchmark(
    if profile:
        print("Starting profiler...")
        profile_input = RequestFuncInput(model=model_id,
+                                         model_name=model_name,
                                         prompt=test_prompt,
                                         api_url=base_url + "/start_profile",
                                         prompt_len=test_prompt_len,
@@ -616,6 +619,7 @@ async def benchmark(
    async for request in get_request(input_requests, request_rate, burstiness):
        prompt, prompt_len, output_len, mm_content = request
        request_func_input = RequestFuncInput(model=model_id,
+                                              model_name=model_name,
                                              prompt=prompt,
                                              api_url=api_url,
                                              prompt_len=prompt_len,
@@ -780,6 +784,7 @@ def main(args: argparse.Namespace):

    backend = args.backend
    model_id = args.model
+    model_name = args.served_model_name
    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
    tokenizer_mode = args.tokenizer_mode

@@ -877,6 +882,7 @@ def main(args: argparse.Namespace):
            api_url=api_url,
            base_url=base_url,
            model_id=model_id,
+            model_name=model_name,
            tokenizer=tokenizer,
            input_requests=input_requests,
            logprobs=args.logprobs,
@@ -1222,5 +1228,12 @@ if __name__ == "__main__":
        'always use the slow tokenizer. \n* '
        '"mistral" will always use the `mistral_common` tokenizer.')

+    parser.add_argument("--served-model-name",
+                        type=str,
+                        default=None,
+                        help="The model name used in the API. "
+                        "If not specified, the model name will be the "
+                        "same as the ``--model`` argument. ")
+
    args = parser.parse_args()
    main(args)