[CI] Fix crashes of performance benchmark (#7500)

Kuntai Du 2024-08-16 08:08:45 -07:00 committed by GitHub
parent 9587b050fb
commit 6fc5b0f249
5 changed files with 21 additions and 36 deletions


@@ -70,23 +70,13 @@ wait_for_server() {
 kill_gpu_processes() {
   # kill all processes on GPU.
-  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
-  if [ -z "$pids" ]; then
-    echo "No GPU processes found."
-  else
-    for pid in $pids; do
-      kill -9 "$pid"
-      echo "Killed process with PID: $pid"
-    done
-    echo "All GPU processes have been killed."
-  fi
+  ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
+  ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9
 
-  # waiting for GPU processes to be fully killed
-  # loop while nvidia-smi returns any processes
-  while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
+  # wait until GPU memory usage smaller than 1GB
+  while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
     sleep 1
-    echo "Waiting for GPU processes to be killed"
   done
 
   # remove vllm config file
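For reference, the new cleanup logic can be exercised on its own. The sketch below is not the repository script, only a minimal standalone version of the replacement lines above: the function name is made up, `-r` is added to the second kill pipeline as well (so an empty match list is skipped instead of being passed to kill), and it assumes nvidia-smi and procps are installed.

#!/bin/bash
# Minimal standalone sketch of the new cleanup behavior (hypothetical helper name).
cleanup_gpu_processes() {
  # Kill the benchmark's serving and worker processes by name instead of via an
  # nvidia-smi PID query; `xargs -r` skips kill entirely when nothing matches.
  ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
  ps -e | grep pt_main_thread | awk '{print $1}' | xargs -r kill -9

  # Poll until the first GPU reports less than 1 GB of memory in use.
  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
  done
}

cleanup_gpu_processes
echo "GPU memory drained; safe to start the next benchmark run."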


@@ -1,47 +1,42 @@
 ## Latency tests
-This test suite aims to test vllm's end-to-end latency under a controlled setup.
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
-### Latency benchmarking results
 {latency_tests_markdown_table}
 ## Throughput tests
-This test suite aims to test vllm's throughput.
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.
-### Throughput benchmarking results
 {throughput_tests_markdown_table}
 ## Serving tests
-This test suite aims to test vllm's real serving metrics.
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B, under QPS 2
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
-### Serving benchmarking results
 {serving_tests_markdown_table}
 ## json version of the benchmarking tables
 This section contains the data of the markdown tables above in JSON format.
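As a rough illustration of the Average QPS bullet above (and not the benchmark client's actual code), arrival times for a Poisson process at a given rate can be generated by summing exponential inter-arrival gaps with mean 1/QPS. The awk sketch below does this for an assumed 4 QPS, 10 requests, and a fixed seed; the rate, count, and seed are illustrative values.

# Sketch only: draw 10 Poisson-process arrival times at an assumed 4 QPS.
awk -v qps=4 -v n=10 'BEGIN {
  srand(42);                         # fixed seed => reproducible schedule
  t = 0;
  for (i = 1; i <= n; i++) {
    t += -log(1 - rand()) / qps;     # exponential gap with mean 1/qps seconds
    printf "request %2d arrives at t = %.3f s\n", i, t;
  }
}'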


@@ -2,7 +2,7 @@
     {
         "test_name": "latency_llama8B_tp1",
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
             "load_format": "dummy",
             "num_iters_warmup": 5,
@@ -12,7 +12,7 @@
     {
         "test_name": "latency_llama70B_tp4",
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
             "load_format": "dummy",
             "num-iters-warmup": 5,


@@ -3,7 +3,7 @@
         "test_name": "serving_llama8B_tp1_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
         "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
@@ -11,7 +11,7 @@
             "load_format": "dummy"
         },
         "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -22,7 +22,7 @@
         "test_name": "serving_llama70B_tp4_sharegpt",
         "qps_list": [1, 4, 16, "inf"],
         "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
             "swap_space": 16,
             "disable_log_stats": "",
@@ -30,7 +30,7 @@
             "load_format": "dummy"
         },
         "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -60,7 +60,7 @@
         "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
         "qps_list": [2],
         "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "disable_log_requests": "",
             "tensor_parallel_size": 4,
             "swap_space": 16,
@@ -70,7 +70,7 @@
             "use_v2_block_manager": ""
         },
         "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",


@@ -2,7 +2,7 @@
     {
         "test_name": "throughput_llama8B_tp1",
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
             "load_format": "dummy",
             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -13,7 +13,7 @@
     {
         "test_name": "throughput_llama70B_tp4",
         "parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "tensor_parallel_size": 4,
             "load_format": "dummy",
             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",