[ci] Add A100 queue into AWS CI template (#5648)
Signed-off-by: kevin <kevin@anyscale.com>
This commit is contained in:
parent
e9c2732b97
commit
3ee5c4bca5
@ -17,6 +17,7 @@ steps:
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
priorityClassName: perf-benchmark
|
||||
containers:
|
||||
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
|
@ -181,3 +181,8 @@ steps:
|
||||
commands:
|
||||
- pip install -r requirements-docs.txt
|
||||
- SPHINXOPTS=\"-W\" make html
|
||||
|
||||
- label: A100 status
|
||||
gpu: a100
|
||||
commands:
|
||||
- nvidia-smi
|
||||
|
@ -49,6 +49,51 @@ steps:
|
||||
command: bash .buildkite/run-cpu-test.sh
|
||||
|
||||
{% for step in steps %}
|
||||
{% if step.gpu == "a100" %}
|
||||
- label: "{{ step.label }}"
|
||||
agents:
|
||||
queue: a100-queue
|
||||
soft_fail: {{ step.soft_fail or false }}
|
||||
{% if step.parallelism %}
|
||||
parallelism: {{ step.parallelism }}
|
||||
{% endif %}
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 5
|
||||
- exit_status: -10 # Agent was lost
|
||||
limit: 5
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
priorityClassName: ci
|
||||
containers:
|
||||
- image: {{ docker_image }}
|
||||
command: ["bash"]
|
||||
args:
|
||||
- '-c'
|
||||
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 8
|
||||
volumeMounts:
|
||||
- name: devshm
|
||||
mountPath: /dev/shm
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
nodeSelector:
|
||||
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||
volumes:
|
||||
- name: devshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
{% else %}
|
||||
- label: "{{ step.label }}"
|
||||
agents:
|
||||
{% if step.label == "Documentation Build" %}
|
||||
@ -90,4 +135,5 @@ steps:
|
||||
{% endif %}
|
||||
volumes:
|
||||
- /dev/shm:/dev/shm
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
Loading…
x
Reference in New Issue
Block a user