[perf bench] H200 development (#9768)

Signed-off-by: simon-mo <simon.mo@hey.com>
2024-11-20 11:06:56 -08:00 · 2024-11-20 11:06:56 -08:00 · 5f1d6af2b6
commit 5f1d6af2b6
parent 772a66732d
3 changed files with 32 additions and 7 deletions
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -9,7 +9,9 @@ steps:
          - image: badouralix/curl-jq
            command:
            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
  - wait
  - label: "A100"
    agents:
      queue: A100
@ -41,6 +43,27 @@ steps:
          - name: devshm
            emptyDir:
              medium: Memory
  # Benchmark step for the H200 machines: runs the nightly performance
  # benchmark script inside the CI image built for the current commit.
  - label: "H200"
    # Only agents listening on the H200 queue pick this step up.
    agents:
      queue: H200
    plugins:
    - docker#v5.12.0:
        # Image tag is the commit under test ($BUILDKITE_COMMIT).
        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
        # Container entry: run the benchmark driver script with bash.
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        # NOTE(review): presumably exposes the buildkite-agent binary inside
        # the container and forwards the job environment -- confirm against
        # the docker plugin documentation for v5.12.0.
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        # NOTE(review): unquoted comma-separated list, apparently meant to
        # select GPU indices 4-7. Docker's own examples spell device lists as
        # "device=4,5,6,7"; verify how the plugin forwards this value.
        gpus: 4,5,6,7
        # Host directory /data/benchmark-hf-cache is mounted at the container's
        # /root/.cache/huggingface path.
        volumes:
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        # Variables are listed by name only; no values are set in this file.
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN
  # - label: "H100"
  #   agents:
  #     queue: H100
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -157,6 +157,11 @@ if __name__ == "__main__":
                                             throughput_results,
                                             serving_results)
    # Sort all dataframes by their respective "Test name" columns
    for df in [latency_results, serving_results, throughput_results]:
        if not df.empty:
            df.sort_values(by="Test name", inplace=True)
    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -6,6 +6,7 @@
 # Do not set -e, as the mixtral 8x22B model tends to crash occasionally
 # and we still want to see other benchmarking results even when mixtral crashes.
 set -x
 set -o pipefail
 check_gpus() {
@ -85,11 +86,7 @@ kill_gpu_processes() {
  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
-  pkill -f pt_main_thread
+  pgrep python3 | xargs -r kill -9
  # this line doesn't work now
  # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
  pkill -f python3
  pkill -f /usr/bin/python3
  # wait until GPU memory usage smaller than 1GB
@ -289,7 +286,7 @@ run_serving_tests() {
    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
-    eval "$server_command" &
+    bash -c "$server_command" &
    server_pid=$!
    # wait until the server is alive
@ -322,7 +319,7 @@ run_serving_tests() {
      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"
-      eval "$client_command"
+      bash -c "$client_command"
      # record the benchmarking commands
      jq_output=$(jq -n \