146 lines
3.7 KiB
Bash
146 lines
3.7 KiB
Bash
#!/bin/bash
|
|
|
|
# Benchmark the overhead of disaggregated prefill.
# Methodology:
# - Send all requests to the prefill vLLM instance. It will buffer the KV cache.
# - Then send all requests to the decode instance.
# - The TTFT of the decode instance is the overhead.
|
|
|
|
# -e: abort on the first command that fails; -x: trace every command as it runs.
set -ex
|
|
|
|
#######################################
# Force-kill every process named pt_main_thread or python3, wipe the vLLM
# config directory, and print the memory in use on GPU 0 so a human can check
# that the cleanup actually freed the device.
# Globals:   none
# Arguments: none
# Outputs:   writes "GPU 0 Memory Usage: <n> MB" to stdout
# Returns:   0 on success; non-zero if nvidia-smi fails
#######################################
kill_gpu_processes() {
  local gpu_memory_usage

  # kill all processes on GPU.
  # A target can exit between pgrep listing it and kill signalling it, which
  # makes xargs return non-zero. That race must not abort the whole script
  # under `set -e`, so failures of these two pipelines are deliberately ignored.
  pgrep pt_main_thread | xargs -r kill -9 || true
  pgrep python3 | xargs -r kill -9 || true

  # Fixed pause after the kills, before the GPU memory is sampled below.
  sleep 10

  # remove vllm config file
  rm -rf ~/.config/vllm

  # Print the GPU memory usage
  # so that we know if all GPU processes are killed.
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  # The memory usage should be 0 MB.
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
|
|
|
|
#######################################
# Block until the vLLM server answers on the given local port.
# Polls http://localhost:<port>/v1/completions once per second with curl and
# stops as soon as curl exits successfully.
# Arguments: $1 - port the server listens on
#            $2 - optional timeout passed to timeout(1); defaults to 1200
# Returns:   0 once the endpoint responds, 1 if the timeout elapses first
#######################################
wait_for_server() {
  local port=$1
  local timeout_s=${2:-1200}

  # The port is handed to the child shell as a positional parameter instead of
  # being spliced into the script text, so it is never re-parsed as code.
  if timeout "$timeout_s" bash -c '
    until curl -s "localhost:$1/v1/completions" > /dev/null; do
      sleep 1
    done' _ "$port"; then
    return 0
  fi

  # Normalise every failure (including timeout's own status 124) to 1.
  return 1
}
|
|
|
|
|
|
#######################################
# Measure the overhead of disaggregated prefill.
# Starts a KV-producer server on GPU 0 (port 8100) and a KV-consumer server on
# GPU 1 (port 8200), replays the same sonnet workload against each in turn,
# saves both result files, then kills the servers.
# Globals:   VLLM_LOGGING_LEVEL, VLLM_HOST_IP (exported)
# Arguments: $1 - request rate (QPS) used for the decode-side run
#            $2 - output length passed as --sonnet-output-len
# Outputs:   ./results/disagg_prefill_tp1.json and
#            ./results/disagg_prefill_tp1_overhead.json
#######################################
benchmark() {
  local qps=$1
  local output_len=$2

  # compare chunked prefill with disaggregated prefill
  local results_folder="./results"
  local model="meta-llama/Meta-Llama-3.1-8B-Instruct"
  local dataset_name="sonnet"
  local dataset_path="../sonnet_4x.txt"
  local num_prompts=10
  local prefix_len=50
  local input_len=2048
  local host_ip

  export VLLM_LOGGING_LEVEL=DEBUG
  # Assigned separately from `export` so a failing pipeline is not masked by
  # the always-successful exit status of `export`.
  host_ip=$(hostname -I | awk '{print $1}')
  export VLLM_HOST_IP="$host_ip"

  # Arguments shared by both benchmark_serving.py runs; only the port, result
  # file name and request rate differ between them.
  local -a workload_args=(
    --backend vllm
    --model "$model"
    --dataset-name "$dataset_name"
    --dataset-path "$dataset_path"
    --sonnet-input-len "$input_len"
    --sonnet-output-len "$output_len"
    --sonnet-prefix-len "$prefix_len"
    --num-prompts "$num_prompts"
  )

  # Prefill instance: KV producer on GPU 0.
  CUDA_VISIBLE_DEVICES=0 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model "$model" \
    --port 8100 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  # Decode instance: KV consumer on GPU 1.
  CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
    --model "$model" \
    --port 8200 \
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

  wait_for_server 8100
  wait_for_server 8200

  # let the prefill instance finish prefill
  python3 ../benchmark_serving.py \
    "${workload_args[@]}" \
    --port 8100 \
    --save-result \
    --result-dir "$results_folder" \
    --result-filename disagg_prefill_tp1.json \
    --request-rate "inf"

  # send the request to decode.
  # The TTFT of this command will be the overhead of disagg prefill impl.
  python3 ../benchmark_serving.py \
    "${workload_args[@]}" \
    --port 8200 \
    --save-result \
    --result-dir "$results_folder" \
    --result-filename disagg_prefill_tp1_overhead.json \
    --request-rate "$qps"

  kill_gpu_processes
}
|
|
|
|
|
|
#######################################
# Install prerequisites, build the 4x sonnet dataset, reset the results
# directory and run the overhead benchmark once with QPS 1 and output length 1.
# Arguments: none are read (command-line arguments are accepted and ignored)
# Outputs:   ../sonnet_4x.txt and a fresh ./results directory, both relative
#            to the disagg_benchmarks directory
#######################################
main() {
  local -a missing_pkgs=()
  local tool
  local default_qps=1
  local default_output_len=1

  # Each tool ships in an apt package of the same name. Collect whatever is
  # absent and install it in one go, refreshing the package lists first so
  # the install also works on images whose apt lists are empty.
  for tool in wget curl jq socat; do
    if ! command -v "$tool" > /dev/null; then
      missing_pkgs+=("$tool")
    fi
  done
  if (( ${#missing_pkgs[@]} > 0 )); then
    apt-get update
    apt-get install -y "${missing_pkgs[@]}"
  fi

  pip install quart httpx datasets

  cd "$(dirname "$0")" || return 1

  cd .. || return 1
  # create sonnet_4x.txt: one empty line followed by four copies of sonnet.txt
  # (the leading empty line is kept so the dataset stays byte-identical to
  # earlier runs).
  echo "" > sonnet_4x.txt
  for _ in {1..4}; do
    cat sonnet.txt >> sonnet_4x.txt
  done
  cd disagg_benchmarks || return 1

  # Start from an empty results directory so stale files never mix in.
  rm -rf results
  mkdir results

  benchmark "$default_qps" "$default_output_len"
}
|
|
|
|
|
|
# Script entry point: hand all command-line arguments to main.
main "$@"
|