Benchmark: add H100 suite (#6047)

2024-07-11 09:17:07 -07:00 · 2024-07-11 09:17:07 -07:00 · 52b7fcb35a
commit 52b7fcb35a
parent b675069d74
2 changed files with 40 additions and 23 deletions
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -11,7 +11,7 @@ steps:
            - sh
            - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
  - wait
-  - label: "A100 Benchmark"
+  - label: "A100"
    agents:
      queue: A100
    plugins:
@ -42,21 +42,20 @@ steps:
          - name: devshm
            emptyDir:
              medium: Memory
-  # - label: "H100: NVIDIA SMI"
-  #   agents:
-  #     queue: H100
-  #   plugins:
-  #   - docker#v5.11.0:
-  #       image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-  #       command:
-  #       - bash
-  #       - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
-  #       mount-buildkite-agent: true
-  #       propagate-environment: true
-  #       propagate-uid-gid: false
-  #       ipc: host
-  #       gpus: all
-  #       environment:
-  #       - VLLM_USAGE_SOURCE
-  #       - HF_TOKEN
+  - label: "H100"
+    agents:
+      queue: H100
+    plugins:
+    - docker#v5.11.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: all
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN

--- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
@ -54,7 +54,7 @@ wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
  timeout 1200 bash -c '
-    until curl localhost:8000/v1/completions; do
+    until curl -X POST localhost:8000/v1/completions; do
      sleep 1
    done' && return 0 || return 1
 }
@ -73,8 +73,17 @@ kill_gpu_processes() {
      echo "All GPU processes have been killed."
  fi

+  # Sometimes kill with pid doesn't work properly, we can also kill all process running python or python3
+  # since we are in container anyway
+  pkill -9 -f python
+  pkill -9 -f python3
+
  # waiting for GPU processes to be fully killed
-  sleep 10
+  # loop while nvidia-smi returns any processes
+  while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
+    sleep 1
+    echo "Waiting for GPU processes to be killed"
+  done

  # remove vllm config file
  rm -rf ~/.config/vllm
@ -90,12 +99,19 @@ upload_to_buildkite() {
  # upload the benchmarking results to buildkite

  # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
+  # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
+  if command -v buildkite-agent >/dev/null 2>&1; then
+    BUILDKITE_AGENT_COMMAND="buildkite-agent"
+  elif [ -f /workspace/buildkite-agent ]; then
+    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
+  else
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+
+  # Use the determined command to annotate and upload artifacts
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
+  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }

 run_latency_tests() {
@ -269,6 +285,7 @@ run_serving_tests() {
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    eval "$server_command" &
+    server_pid=$!

    # wait until the server is alive
    wait_for_server
@ -318,6 +335,7 @@ run_serving_tests() {
    done

    # clean up
+    kill -9 $server_pid
    kill_gpu_processes
  done
 }