#!/bin/bash

set -o pipefail
set -x

check_gpus() {
  # check the number of GPUs and GPU type.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
  echo "GPU type is $gpu_type"
}
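
# Note: gpu_type keeps only the second whitespace-separated token of the
# nvidia-smi device name, e.g. "NVIDIA A100-SXM4-80GB" -> "A100-SXM4-80GB"
# (an illustrative name; the actual value depends on the machine).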

check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then
    echo "Error: HF_TOKEN is not set."
    exit 1
  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'."
    exit 1
  else
    echo "HF_TOKEN is set and valid."
  fi
}
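
# A valid value looks like the following (illustrative placeholder, not a real token):
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxx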

upload_to_buildkite() {
  # upload the benchmarking results to buildkite

  # if the agent binary is not found, skip uploading the results and return 0
  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}

get_current_llm_serving_engine() {

  if which lmdeploy >/dev/null; then
    echo "Container: lmdeploy"
    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
    return
  fi

  if [ -e /tgi-entrypoint.sh ]; then
    echo "Container: tgi"
    export CURRENT_LLM_SERVING_ENGINE=tgi
    return
  fi

  if which trtllm-build >/dev/null; then
    echo "Container: tensorrt-llm"
    export CURRENT_LLM_SERVING_ENGINE=trt
    return
  fi

  if [ -e /sgl-workspace ]; then
    echo "Container: sglang"
    export CURRENT_LLM_SERVING_ENGINE=sglang
    return
  fi

  if [ -e /vllm-workspace ]; then
    echo "Container: vllm"
    # move to a completely irrelevant directory, to avoid importing vllm from the current folder
    export CURRENT_LLM_SERVING_ENGINE=vllm

    return
  fi
}
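
# Detection above is heuristic: each nightly container image is identified by an
# engine-specific binary on PATH (lmdeploy, trtllm-build) or a marker path baked
# into the image (/tgi-entrypoint.sh, /sgl-workspace, /vllm-workspace).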

json2args() {
  # transforms the JSON string to command line args, and '_' is replaced with '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}
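
# Usage sketch (the JSON literal is the example from the comment above, not a
# real test config):
#   args=$(json2args '{"model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1}')
#   # args is now: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1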

kill_gpu_processes() {
  pkill -f python
  pkill -f python3
  pkill -f tritonserver
  pkill -f pt_main_thread
  pkill -f text-generation
  pkill -f lmdeploy

  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
  done
}
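
# Note: after killing the serving processes, this blocks until the first GPU
# reports less than 1000 MiB of used memory, so the next server starts against
# an (almost) empty GPU.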

wait_for_server() {
  # wait for the serving engine to start
  # return 1 if the server does not come up before the timeout
  timeout 1200 bash -c '
    until curl -s localhost:8000/v1/completions > /dev/null; do
      sleep 1
    done' && return 0 || return 1
}
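
# Note: the readiness probe polls the OpenAI-compatible /v1/completions endpoint
# on the hard-coded port 8000, regardless of the per-test "port" value used
# elsewhere in this script, and allows up to 1200 seconds for startup.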

ensure_installed() {
  # Ensure that the given command is installed by apt-get
  local cmd=$1
  if ! which "$cmd" >/dev/null; then
    apt-get update && apt-get install -y "$cmd"
  fi
}
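
# Example usage (see main below): ensure_installed jq
# The check is `which "$cmd"`, so it only short-circuits when the apt package
# installs a command of the same name.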

run_serving_tests() {
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there are enough GPUs to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi

    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi

    # prepare tokenizer
    # this is required for lmdeploy.
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    rm -rf /tokenizer_cache
    mkdir /tokenizer_cache
    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
      --model "$model" \
      --cachedir /tokenizer_cache
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"

    # change the model name for lmdeploy (it does not follow the standard hf name)
    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # qps values come quoted from @sh above; normalize the "inf" case
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      backend=$CURRENT_LLM_SERVING_ENGINE

      if [[ $backend = "trt" ]]; then
        backend="tensorrt-llm"
      fi

      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi

      if [[ "$dataset_name" = "sharegpt" ]]; then

        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"

      elif [[ "$dataset_name" = "sonnet" ]]; then

        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --sonnet-input-len $sonnet_input_len \
          --sonnet-output-len $sonnet_output_len \
          --sonnet-prefix-len $sonnet_prefix_len \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"

      else

        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
        exit 1

      fi

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      server_command="None"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

    done

  done

  kill_gpu_processes
}
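
# For reference, run_serving_tests above expects each entry of the test file to
# look roughly like the sketch below. The field names are the ones read via jq
# in the function; the values are illustrative placeholders, not a real config:
#   {
#     "test_name": "serving_llama_tp1_sharegpt",
#     "qps_list": [4, "inf"],
#     "common_parameters": {
#       "model": "meta-llama/Llama-2-7b-chat-hf",
#       "tp": 1,
#       "dataset_name": "sharegpt",
#       "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
#       "port": 8000,
#       "num_prompts": 200,
#       "reuse_server": false
#     },
#     "vllm_server_parameters": { "tensor_parallel_size": 1 },
#     "vllm_client_parameters": {}
#   }
# Sonnet tests additionally provide sonnet_input_len, sonnet_output_len, and
# sonnet_prefix_len inside common_parameters.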

run_genai_perf_tests() {
  # run genai-perf tests
  # $1: a json file specifying genai-perf test cases

  local genai_perf_test_file
  genai_perf_test_file=$1

  # Iterate over genai-perf tests
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # get the test name
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # get server arguments
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there are enough GPUs to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi

    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # qps values come quoted from @sh above; "inf" is replaced by num_prompts
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps=$num_prompts
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps
      backend=$CURRENT_LLM_SERVING_ENGINE

      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi

      # TODO: add output dir.
      client_command="genai-perf profile \
        -m $model \
        --service-kind openai \
        --backend vllm \
        --endpoint-type chat \
        --streaming \
        --url localhost:$port \
        --request-rate $qps \
        --num-prompts $num_prompts \
        "

      echo "Client command: $client_command"

      eval "$client_command"

      # TODO: process/record outputs
    done

  done

  kill_gpu_processes

}

prepare_dataset() {

  # download sharegpt dataset
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  echo "" > sonnet_4x.txt
  for _ in {1..4}
  do
    cat sonnet.txt >> sonnet_4x.txt
  done

}

main() {

  # check that the environment variables are successfully injected from the yaml

  check_gpus
  check_hf_token
  get_current_llm_serving_engine

  pip install -U transformers

  pip install -r requirements/dev.txt
  which genai-perf

  # check storage
  df -h

  ensure_installed wget
  ensure_installed curl
  ensure_installed jq
  # genai-perf dependency
  ensure_installed libb64-0d

  prepare_dataset

  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"

  # run the serving tests
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  # run genai-perf tests
  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
  mv artifacts/ $RESULTS_FOLDER/

  # summarize and upload benchmark results to buildkite
  python3 -m pip install tabulate pandas
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite

}

main "$@"