[perf bench] H200 development (#9768)

Signed-off-by: simon-mo <simon.mo@hey.com>
2024-11-20 11:06:56 -08:00 · 2024-11-20 11:06:56 -08:00 · 5f1d6af2b6
commit 5f1d6af2b6
parent 772a66732d
3 changed files with 32 additions and 7 deletions
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -9,7 +9,9 @@ steps:
          - image: badouralix/curl-jq
            command:
            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+
  - wait
+
  - label: "A100"
    agents:
      queue: A100
@ -41,6 +43,27 @@ steps:
          - name: devshm
            emptyDir:
              medium: Memory
+
+  - label: "H200"  # performance-benchmark step dispatched to agents on the H200 queue
+    agents:
+      queue: H200
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT  # image tag is taken from $BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh  # benchmark entry script, run with bash inside the container
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: 4,5,6,7  # NOTE(review): presumably meant to select GPU indices 4-7; confirm how the docker plugin / `docker run --gpus` interprets a bare comma list
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface  # host path mounted at the container's Hugging Face cache directory
+        environment:
+        - VLLM_USAGE_SOURCE  # listed by name only; presumably forwarded from the agent environment (verify)
+        - HF_TOKEN  # listed by name only; presumably forwarded from the agent environment (verify)
+
+
  # - label: "H100"
  #   agents:
  #     queue: H100
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -157,6 +157,11 @@ if __name__ == "__main__":
                                             throughput_results,
                                             serving_results)

+    # Sort each non-empty results dataframe in place by its "Test name" column
+    for df in [latency_results, serving_results, throughput_results]:
+        if not df.empty:
+            df.sort_values(by="Test name", inplace=True)
+
    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -6,6 +6,7 @@

 # Do not set -e, as the mixtral 8x22B model tends to crash occasionally
 # and we still want to see other benchmarking results even when mixtral crashes.
+set -x
 set -o pipefail

 check_gpus() {
@ -85,11 +86,7 @@ kill_gpu_processes() {

  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
-  pkill -f pt_main_thread
-  # this line doesn't work now
-  # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
-  pkill -f python3
-  pkill -f /usr/bin/python3
+  pgrep python3 | xargs -r kill -9


  # wait until GPU memory usage smaller than 1GB
@ -289,7 +286,7 @@ run_serving_tests() {
    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
-    eval "$server_command" &
+    bash -c "$server_command" &
    server_pid=$!

    # wait until the server is alive
@ -322,7 +319,7 @@ run_serving_tests() {
      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

-      eval "$client_command"
+      bash -c "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \