[Speculative decoding] Add serving benchmark for llama3 70b + speculative decoding (#6964)
This commit is contained in:
parent
fb4f530bf5
commit
c32ab8be1a
@ -55,5 +55,26 @@
|
|||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
|
||||||
|
"qps_list": [2],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"swap_space": 16,
|
||||||
|
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
|
||||||
|
"num_speculative_tokens": 4,
|
||||||
|
"speculative_draft_tensor_parallel_size": 1,
|
||||||
|
"use_v2_block_manager": ""
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
Loading…
x
Reference in New Issue
Block a user