[perf bench] H200 development (#9768)
Signed-off-by: simon-mo <simon.mo@hey.com>
This commit is contained in:
parent
772a66732d
commit
5f1d6af2b6
@ -9,7 +9,9 @@ steps:
|
||||
- image: badouralix/curl-jq
|
||||
command:
|
||||
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||
|
||||
- wait
|
||||
|
||||
- label: "A100"
|
||||
agents:
|
||||
queue: A100
|
||||
@ -41,6 +43,27 @@ steps:
|
||||
- name: devshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
|
||||
- label: "H200"
|
||||
agents:
|
||||
queue: H200
|
||||
plugins:
|
||||
- docker#v5.12.0:
|
||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash
|
||||
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
mount-buildkite-agent: true
|
||||
propagate-environment: true
|
||||
ipc: host
|
||||
gpus: 4,5,6,7
|
||||
volumes:
|
||||
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||
environment:
|
||||
- VLLM_USAGE_SOURCE
|
||||
- HF_TOKEN
|
||||
|
||||
|
||||
# - label: "H100"
|
||||
# agents:
|
||||
# queue: H100
|
||||
|
@ -157,6 +157,11 @@ if __name__ == "__main__":
|
||||
throughput_results,
|
||||
serving_results)
|
||||
|
||||
# Sort all dataframes by their respective "Test name" columns
|
||||
for df in [latency_results, serving_results, throughput_results]:
|
||||
if not df.empty:
|
||||
df.sort_values(by="Test name", inplace=True)
|
||||
|
||||
# get markdown tables
|
||||
latency_md_table = tabulate(latency_results,
|
||||
headers='keys',
|
||||
|
@ -6,6 +6,7 @@
|
||||
|
||||
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
|
||||
# and we still want to see other benchmarking results even when mixtral crashes.
|
||||
set -x
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
@ -85,11 +86,7 @@ kill_gpu_processes() {
|
||||
|
||||
ps -aux
|
||||
lsof -t -i:8000 | xargs -r kill -9
|
||||
pkill -f pt_main_thread
|
||||
# this line doesn't work now
|
||||
# ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
|
||||
pkill -f python3
|
||||
pkill -f /usr/bin/python3
|
||||
pgrep python3 | xargs -r kill -9
|
||||
|
||||
|
||||
# wait until GPU memory usage is smaller than 1GB
|
||||
@ -289,7 +286,7 @@ run_serving_tests() {
|
||||
# run the server
|
||||
echo "Running test case $test_name"
|
||||
echo "Server command: $server_command"
|
||||
eval "$server_command" &
|
||||
bash -c "$server_command" &
|
||||
server_pid=$!
|
||||
|
||||
# wait until the server is alive
|
||||
@ -322,7 +319,7 @@ run_serving_tests() {
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
bash -c "$client_command"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
|
Loading…
x
Reference in New Issue
Block a user