From 3ee5c4bca514ee95592a018fae95e050fd6763c0 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Wed, 19 Jun 2024 07:42:13 -0700
Subject: [PATCH] [ci] Add A100 queue into AWS CI template (#5648)

Signed-off-by: kevin
---
Reviewer notes (free-form text in this position is ignored by `git am`):
  * benchmark-pipeline.yaml: the benchmark pod spec gains
    `priorityClassName: perf-benchmark`.
  * test-pipeline.yaml: appends an "A100 status" step tagged `gpu: a100`
    whose only command is `nvidia-smi`.
  * test-template-aws.j2: steps with `step.gpu == "a100"` are rendered as a
    Kubernetes-plugin step on queue `a100-queue` (8 x nvidia.com/gpu, node
    selector NVIDIA-A100-SXM4-80GB, memory-backed /dev/shm volume); every
    other step falls through to the pre-existing rendering in `{% else %}`.
  * NOTE(review): this copy of the patch had lost its line breaks; the layout
    and indentation below were reconstructed from the hunk headers. Verify
    the context-line whitespace against the upstream commit before applying.

 .../benchmark-pipeline.yaml                   |  1 +
 .buildkite/test-pipeline.yaml                 |  5 ++
 .buildkite/test-template-aws.j2               | 46 +++++++++++++++++++
 3 files changed, 52 insertions(+)

diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index 8f12748b..2b25c954 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -17,6 +17,7 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
+          priorityClassName: perf-benchmark
           containers:
           - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
             command:
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 5afe3730..c2160fee 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -181,3 +181,8 @@ steps:
   commands:
   - pip install -r requirements-docs.txt
   - SPHINXOPTS=\"-W\" make html
+
+- label: A100 status
+  gpu: a100
+  commands:
+  - nvidia-smi
diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
index 01f7ff1e..08146bf4 100644
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -49,6 +49,51 @@ steps:
     command: bash .buildkite/run-cpu-test.sh

   {% for step in steps %}
+  {% if step.gpu == "a100" %}
+  - label: "{{ step.label }}"
+    agents:
+      queue: a100-queue
+    soft_fail: {{ step.soft_fail or false }}
+    {% if step.parallelism %}
+    parallelism: {{ step.parallelism }}
+    {% endif %}
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+    plugins:
+      - kubernetes:
+          podSpec:
+            priorityClassName: ci
+            containers:
+            - image: {{ docker_image }}
+              command: ["bash"]
+              args:
+              - '-c'
+              - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
+              resources:
+                limits:
+                  nvidia.com/gpu: 8
+              volumeMounts:
+              - name: devshm
+                mountPath: /dev/shm
+              env:
+              - name: VLLM_USAGE_SOURCE
+                value: ci-test
+              - name: HF_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: token
+            nodeSelector:
+              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+            volumes:
+            - name: devshm
+              emptyDir:
+                medium: Memory
+  {% else %}
   - label: "{{ step.label }}"
     agents:
       {% if step.label == "Documentation Build" %}
@@ -90,4 +135,5 @@ steps:
       {% endif %}
       volumes:
         - /dev/shm:/dev/shm
+  {% endif %}
   {% endfor %}