[Speculative decoding] Add serving benchmark for llama3 70b + speculative decoding (#6964)

This commit is contained in:
Cade Daniel 2024-07-30 17:53:21 -07:00 committed by GitHub
parent fb4f530bf5
commit c32ab8be1a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -55,5 +55,26 @@
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200 "num_prompts": 200
} }
},
{
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
"qps_list": [2],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1,
"use_v2_block_manager": ""
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
} }
] ]