#!/bin/bash

set -o pipefail
set -x

check_gpus() {
  # check the number of GPUs and GPU type.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
  echo "GPU type is $gpu_type"
}

check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then
    echo "Error: HF_TOKEN is not set."
    exit 1
  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'."
    exit 1
  else
    echo "HF_TOKEN is set and valid."
  fi
}

upload_to_buildkite() {
  # upload the benchmarking results to buildkite
  # if the agent binary is not found, skip uploading the results, exit 0
  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}

get_current_llm_serving_engine() {
  if which lmdeploy >/dev/null; then
    echo "Container: lmdeploy"
    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
    return
  fi

  if [ -e /tgi-entrypoint.sh ]; then
    echo "Container: tgi"
    export CURRENT_LLM_SERVING_ENGINE=tgi
    return
  fi

  if which trtllm-build >/dev/null; then
    echo "Container: tensorrt-llm"
    export CURRENT_LLM_SERVING_ENGINE=trt
    return
  fi

  if [ -e /sgl-workspace ]; then
    echo "Container: sglang"
    export CURRENT_LLM_SERVING_ENGINE=sglang
    return
  fi

  if [ -e /vllm-workspace ]; then
    echo "Container: vllm"
    # move to a completely irrelevant directory, to avoid import vllm from current folder
    export CURRENT_LLM_SERVING_ENGINE=vllm
    return
  fi
}

json2args() {
  # transforms the JSON string to command line args, and '_' is replaced to '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

kill_gpu_processes() {
  pkill -f python
  pkill -f python3
  pkill -f tritonserver
  pkill -f pt_main_thread
  pkill -f text-generation
  pkill -f lmdeploy

  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
  done
}

wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
  timeout 1200 bash -c '
    until curl -s localhost:8000/v1/completions > /dev/null; do
      sleep 1
    done' && return 0 || return 1
}

ensure_installed() {
  # Ensure that the given command is installed by apt-get
  local cmd=$1
  if ! which "$cmd" >/dev/null; then
    apt-get update && apt-get install -y "$cmd"
  fi
}
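# The test files consumed below are JSON arrays of test cases. The only structural
# assumptions this script makes are the fields it queries via jq: test_name, qps_list,
# common_parameters (model, tp, dataset_name, dataset_path, port, num_prompts,
# reuse_server) and the per-engine ${ENGINE}_server_parameters /
# ${ENGINE}_client_parameters objects (the server parameters are handed to
# launch-server.sh as-is). A minimal entry could look like the sketch below; the
# values are hypothetical examples, not taken from the actual nightly-tests.json:
#
# [
#   {
#     "test_name": "llama7B_tp1_sharegpt",
#     "qps_list": [4, 8, "inf"],
#     "common_parameters": {
#       "model": "meta-llama/Llama-2-7b-chat-hf",
#       "tp": 1,
#       "dataset_name": "sharegpt",
#       "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
#       "port": 8000,
#       "num_prompts": 200,
#       "reuse_server": false
#     },
#     "vllm_server_parameters": {},
#     "vllm_client_parameters": {}
#   }
# ]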
run_serving_tests() {
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases
  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi

    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi

    # prepare tokenizer
    # this is required for lmdeploy.
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    rm -rf /tokenizer_cache
    mkdir /tokenizer_cache
    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
      --model "$model" \
      --cachedir /tokenizer_cache
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"

    # change model name for lmdeploy (it will not follow standard hf name)
    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
    fi
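    # NOTE: `jq -r '.[] | @sh'` above emits each QPS value as a shell-quoted word
    # (e.g. 4 8 'inf'), so string entries such as "inf" reach the loop below wrapped
    # in single quotes and must be normalized before being passed to the client.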
    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      backend=$CURRENT_LLM_SERVING_ENGINE
      if [[ $backend = "trt" ]]; then
        backend="tensorrt-llm"
      fi

      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi

      if [[ "$dataset_name" = "sharegpt" ]]; then

        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"

      elif [[ "$dataset_name" = "sonnet" ]]; then

        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --sonnet-input-len $sonnet_input_len \
          --sonnet-output-len $sonnet_output_len \
          --sonnet-prefix-len $sonnet_prefix_len \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"

      else
        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
        exit 1
      fi

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      server_command="None"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

    done
  done

  kill_gpu_processes
}

run_genai_perf_tests() {
  # run genai-perf tests
  # $1: a json file specifying genai-perf test cases
  local genai_perf_test_file
  genai_perf_test_file=$1

  # Iterate over genai-perf tests
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi

    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps=$num_prompts
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps
      backend=$CURRENT_LLM_SERVING_ENGINE
      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
      #TODO: add output dir.
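      # NOTE: no explicit output directory is passed to genai-perf here; it appears to
      # write its results under ./artifacts by default, which is why main() later moves
      # artifacts/ into $RESULTS_FOLDER.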
client_command="genai-perf profile \ -m $model \ --service-kind openai \ --backend vllm \ --endpoint-type chat \ --streaming \ --url localhost:$port \ --request-rate $qps \ --num-prompts $num_prompts \ " echo "Client command: $client_command" eval "$client_command" #TODO: process/record outputs done done kill_gpu_processes } prepare_dataset() { # download sharegpt dataset cd "$VLLM_SOURCE_CODE_LOC/benchmarks" wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # duplicate sonnet by 4x, to allow benchmarking with input length 2048 cd "$VLLM_SOURCE_CODE_LOC/benchmarks" echo "" > sonnet_4x.txt for _ in {1..4} do cat sonnet.txt >> sonnet_4x.txt done } main() { # check if the environment variable is successfully injected from yaml check_gpus check_hf_token get_current_llm_serving_engine pip install -U transformers pip install -r requirements/dev.txt which genai-perf # check storage df -h ensure_installed wget ensure_installed curl ensure_installed jq # genai-perf dependency ensure_installed libb64-0d prepare_dataset cd "$VLLM_SOURCE_CODE_LOC/benchmarks" declare -g RESULTS_FOLDER=results/ mkdir -p $RESULTS_FOLDER BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" # run the test run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" # run genai-perf tests run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" mv artifacts/ $RESULTS_FOLDER/ # upload benchmark results to buildkite python3 -m pip install tabulate pandas python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" upload_to_buildkite } main "$@"