benchmark_serving support --served-model-name param (#12109)
Signed-off-by: zibai <zibai.gj@alibaba-inc.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
parent e66faf4809
commit 936db119ed
@@ -22,6 +22,7 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
+    model_name: Optional[str] = None
     best_of: int = 1
     logprobs: Optional[int] = None
     extra_body: Optional[dict] = None
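
For orientation, a minimal sketch of how the new optional field behaves. This is a stand-in, not the full upstream dataclass: it keeps only the fields visible in this diff plus the prompt/api_url arguments used by the constructor calls further down, and the values are illustrative.

from dataclasses import dataclass
from typing import Optional

@dataclass
class RequestFuncInput:
    # Stand-in with only the fields this diff touches or uses below.
    prompt: str
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    model_name: Optional[str] = None  # new field: API-facing model name, optional
    best_of: int = 1
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None

# Without --served-model-name, model_name stays None and the request payload
# falls back to `model`; when set, it overrides the payload's "model" field.
req = RequestFuncInput(prompt="hello",
                       api_url="http://localhost:8000/v1/completions",
                       prompt_len=1,
                       output_len=8,
                       model="my-local-model-path")
print(req.model_name)  # None -> the request will use req.model
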
@@ -78,7 +79,7 @@ async def async_request_tgi(
                             continue
                         chunk_bytes = chunk_bytes.decode("utf-8")
 
-                        #NOTE: Sometimes TGI returns a ping response without
+                        # NOTE: Sometimes TGI returns a ping response without
                         # any data, we should skip it.
                         if chunk_bytes.startswith(":"):
                             continue
@@ -235,7 +236,8 @@ async def async_request_openai_completions(
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         payload = {
-            "model": request_func_input.model,
+            "model": request_func_input.model_name \
+            if request_func_input.model_name else request_func_input.model,
             "prompt": request_func_input.prompt,
             "temperature": 0.0,
             "best_of": request_func_input.best_of,
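
The added payload line is a simple fallback: use the served model name when one was given, otherwise keep the --model value. A small illustrative sketch of that logic (the helper name is mine, not from the diff):

from typing import Optional

def resolve_payload_model(model: str, model_name: Optional[str]) -> str:
    # Prefer the served model name when given, otherwise fall back to --model.
    return model_name if model_name else model

assert resolve_payload_model("path/to/weights", None) == "path/to/weights"
assert resolve_payload_model("path/to/weights", "served-name") == "served-name"
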
@@ -328,7 +330,8 @@ async def async_request_openai_chat_completions(
         if request_func_input.multi_modal_content:
             content.append(request_func_input.multi_modal_content)
         payload = {
-            "model": request_func_input.model,
+            "model": request_func_input.model_name \
+            if request_func_input.model_name else request_func_input.model,
             "messages": [
                 {
                     "role": "user",
@@ -525,6 +525,7 @@ async def benchmark(
     api_url: str,
     base_url: str,
     model_id: str,
+    model_name: str,
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     logprobs: Optional[int],
@@ -553,6 +554,7 @@ async def benchmark(
             "Multi-modal content is only supported on 'openai-chat' backend.")
     test_input = RequestFuncInput(
         model=model_id,
+        model_name=model_name,
         prompt=test_prompt,
         api_url=api_url,
         prompt_len=test_prompt_len,
@@ -573,6 +575,7 @@ async def benchmark(
     if profile:
         print("Starting profiler...")
         profile_input = RequestFuncInput(model=model_id,
+                                         model_name=model_name,
                                          prompt=test_prompt,
                                          api_url=base_url + "/start_profile",
                                          prompt_len=test_prompt_len,
@@ -616,6 +619,7 @@ async def benchmark(
     async for request in get_request(input_requests, request_rate, burstiness):
         prompt, prompt_len, output_len, mm_content = request
         request_func_input = RequestFuncInput(model=model_id,
+                                              model_name=model_name,
                                               prompt=prompt,
                                               api_url=api_url,
                                               prompt_len=prompt_len,
@@ -780,6 +784,7 @@ def main(args: argparse.Namespace):
 
     backend = args.backend
     model_id = args.model
+    model_name = args.served_model_name
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer_mode = args.tokenizer_mode
 
@@ -877,6 +882,7 @@ def main(args: argparse.Namespace):
             api_url=api_url,
             base_url=base_url,
             model_id=model_id,
+            model_name=model_name,
             tokenizer=tokenizer,
             input_requests=input_requests,
             logprobs=args.logprobs,
@@ -1222,5 +1228,12 @@ if __name__ == "__main__":
         'always use the slow tokenizer. \n* '
         '"mistral" will always use the `mistral_common` tokenizer.')
 
+    parser.add_argument("--served-model-name",
+                        type=str,
+                        default=None,
+                        help="The model name used in the API. "
+                        "If not specified, the model name will be the "
+                        "same as the ``--model`` argument. ")
+
     args = parser.parse_args()
     main(args)
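
Putting the pieces together, a hedged sketch of how the new flag flows from the command line into the request payload. Argument and variable names follow this diff; the parser below is a cut-down stand-in, not the real benchmark_serving.py parser, and the example values are illustrative.

import argparse
from typing import Optional

parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True)
parser.add_argument("--served-model-name", type=str, default=None)

# e.g.: python benchmark_serving.py --model path/to/weights --served-model-name my-api-name
args = parser.parse_args(["--model", "path/to/weights",
                          "--served-model-name", "my-api-name"])

model_id: str = args.model
model_name: Optional[str] = args.served_model_name

# Same fallback the payload construction above performs.
payload_model = model_name if model_name else model_id
print(payload_model)  # -> "my-api-name"
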