From d5b1eb081e193c54ac21390a0f6ba7013e4f3b11 Mon Sep 17 00:00:00 2001
From: Simon Mo
Date: Wed, 5 Jun 2024 11:42:08 -0500
Subject: [PATCH] [CI] Add nightly benchmarks (#5260)

---
 .../nightly-benchmarks/kickoff-pipeline.sh | 37 ++++++++++++++++++
 .buildkite/nightly-benchmarks/sample.yaml  | 39 +++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100755 .buildkite/nightly-benchmarks/kickoff-pipeline.sh
 create mode 100644 .buildkite/nightly-benchmarks/sample.yaml

diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
new file mode 100755
index 00000000..d3bf3b72
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+#
+# Kick off the nightly benchmark pipeline on Buildkite.
+#
+# Installs curl, jq and minijinja, then uploads sample.yaml as the
+# pipeline. For pull-request builds the upload happens only when the PR
+# carries the 'perf-benchmarks' label; otherwise the script exits 0.
+
+set -euo pipefail
+
+# Install system packages. apt-get is used instead of apt because apt's
+# command-line interface is not guaranteed to be stable for scripts.
+apt-get update
+apt-get install -y curl jq
+
+# Install minijinja for templating
+curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
+source "$HOME/.cargo/env"
+
+# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
+if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
+  # -f makes curl fail on HTTP errors (e.g. API rate limiting) so that
+  # pipefail aborts here instead of feeding an error document to jq.
+  PR_LABELS=$(curl -sSf "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
+
+  # jq prints one label per line; match the whole line so that labels which
+  # merely contain 'perf-benchmarks' as a substring do not trigger a run.
+  if grep -Fxq "perf-benchmarks" <<< "$PR_LABELS"; then
+    echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
+  else
+    echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
+    exit 0
+  fi
+fi
+
+# Upload sample.yaml
+buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml
diff --git a/.buildkite/nightly-benchmarks/sample.yaml b/.buildkite/nightly-benchmarks/sample.yaml
new file mode 100644
index 00000000..50e6e820
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/sample.yaml
@@ -0,0 +1,39 @@
+steps:
+  # NOTE(simon): You can create separate blocks for different jobs
+  - label: "A100: NVIDIA SMI"
+    agents:
+      queue: A100
+    plugins:
+      - kubernetes:
+          podSpec:
+            containers:
+              # - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT
+              # TODO(simon): check latest main branch or use the PR image.
+              - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
+                command:
+                  - bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls'
+                resources:
+                  limits:
+                    nvidia.com/gpu: 8
+                volumeMounts:
+                  - name: devshm
+                    mountPath: /dev/shm
+            nodeSelector:
+              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+            volumes:
+              - name: devshm
+                emptyDir:
+                  medium: Memory
+  # TODO(simon): bring H100 online
+  # - label: "H100: NVIDIA SMI"
+  #   agents:
+  #     queue: H100
+  #   plugins:
+  #     - docker#v5.11.0:
+  #         image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
+  #         command:
+  #           - bash -c 'nvidia-smi && nvidia-smi topo -m'
+  #         propagate-environment: true
+  #         ipc: host
+  #         gpus: all
+