[ci] Add A100 queue into AWS CI template (#5648)

Signed-off-by: kevin <kevin@anyscale.com>
This commit is contained in:
Kevin H. Luu 2024-06-19 07:42:13 -07:00 committed by GitHub
parent e9c2732b97
commit 3ee5c4bca5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 52 additions and 0 deletions

View File

@ -17,6 +17,7 @@ steps:
plugins:
- kubernetes:
podSpec:
priorityClassName: perf-benchmark
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:

View File

@ -181,3 +181,8 @@ steps:
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
# Pipeline step labelled "A100 status": its only command is `nvidia-smi`.
# The `gpu: a100` field tags the step for A100 handling.
# NOTE(review): presumably this is the field the CI template compares against
# "a100" to pick the A100 queue — confirm against the template.
- label: A100 status
gpu: a100
commands:
- nvidia-smi

View File

@ -49,6 +49,51 @@ steps:
command: bash .buildkite/run-cpu-test.sh
{% for step in steps %}
{% if step.gpu == "a100" %}
# A100 variant of a generated step: emitted when the step's gpu field equals "a100".
# NOTE: keep template delimiters out of these comments; they are rendered with the file.
- label: "{{ step.label }}"
agents:
# Jobs for this step are routed to the a100-queue agent queue.
queue: a100-queue
# Falls back to false when the step does not set soft_fail.
soft_fail: {{ step.soft_fail or false }}
# parallelism is emitted only when the step defines it.
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
# Automatic retries, up to 5 each, for exit statuses -1 and -10.
# NOTE(review): both entries carry the same "Agent was lost" comment — confirm
# what -10 denotes and adjust its comment if it differs.
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
plugins:
- kubernetes:
# Pod spec handed to the kubernetes plugin for this step.
podSpec:
priorityClassName: ci
containers:
- image: {{ docker_image }}
command: ["bash"]
# The shell line changes into the step's working_dir (or default_working_dir when
# unset) and then runs step.command, or step.commands joined with ' && '.
# The inner single quotes are part of the argument value itself.
args:
- '-c'
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
# The container's GPU limit is fixed at 8.
resources:
limits:
nvidia.com/gpu: 8
# Mounts the devshm volume (declared under volumes below) at /dev/shm.
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
# HF_TOKEN is read from key "token" of the secret named hf-token-secret.
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
# Restricts scheduling to nodes whose GPU product label is NVIDIA-A100-SXM4-80GB.
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
# devshm is a memory-backed emptyDir volume.
volumes:
- name: devshm
emptyDir:
medium: Memory
{% else %}
- label: "{{ step.label }}"
agents:
{% if step.label == "Documentation Build" %}
@ -90,4 +135,5 @@ steps:
{% endif %}
volumes:
- /dev/shm:/dev/shm
{% endif %}
{% endfor %}