[ci] Add A100 queue into AWS CI template (#5648)
Signed-off-by: kevin <kevin@anyscale.com>
This commit is contained in:
parent
e9c2732b97
commit
3ee5c4bca5
@ -17,6 +17,7 @@ steps:
|
|||||||
plugins:
|
plugins:
|
||||||
- kubernetes:
|
- kubernetes:
|
||||||
podSpec:
|
podSpec:
|
||||||
|
priorityClassName: perf-benchmark
|
||||||
containers:
|
containers:
|
||||||
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||||
command:
|
command:
|
||||||
|
@ -181,3 +181,8 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install -r requirements-docs.txt
|
- pip install -r requirements-docs.txt
|
||||||
- SPHINXOPTS=\"-W\" make html
|
- SPHINXOPTS=\"-W\" make html
|
||||||
|
|
||||||
|
- label: A100 status
|
||||||
|
gpu: a100
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
@ -49,6 +49,51 @@ steps:
|
|||||||
command: bash .buildkite/run-cpu-test.sh
|
command: bash .buildkite/run-cpu-test.sh
|
||||||
|
|
||||||
{% for step in steps %}
|
{% for step in steps %}
|
||||||
|
{% if step.gpu == "a100" %}
|
||||||
|
- label: "{{ step.label }}"
|
||||||
|
agents:
|
||||||
|
queue: a100-queue
|
||||||
|
soft_fail: {{ step.soft_fail or false }}
|
||||||
|
{% if step.parallelism %}
|
||||||
|
parallelism: {{ step.parallelism }}
|
||||||
|
{% endif %}
|
||||||
|
retry:
|
||||||
|
automatic:
|
||||||
|
- exit_status: -1 # Agent was lost
|
||||||
|
limit: 5
|
||||||
|
- exit_status: -10 # Agent was lost
|
||||||
|
limit: 5
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
priorityClassName: ci
|
||||||
|
containers:
|
||||||
|
- image: {{ docker_image }}
|
||||||
|
command: ["bash"]
|
||||||
|
args:
|
||||||
|
- '-c'
|
||||||
|
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 8
|
||||||
|
volumeMounts:
|
||||||
|
- name: devshm
|
||||||
|
mountPath: /dev/shm
|
||||||
|
env:
|
||||||
|
- name: VLLM_USAGE_SOURCE
|
||||||
|
value: ci-test
|
||||||
|
- name: HF_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
nodeSelector:
|
||||||
|
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||||
|
volumes:
|
||||||
|
- name: devshm
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
{% else %}
|
||||||
- label: "{{ step.label }}"
|
- label: "{{ step.label }}"
|
||||||
agents:
|
agents:
|
||||||
{% if step.label == "Documentation Build" %}
|
{% if step.label == "Documentation Build" %}
|
||||||
@ -90,4 +135,5 @@ steps:
|
|||||||
{% endif %}
|
{% endif %}
|
||||||
volumes:
|
volumes:
|
||||||
- /dev/shm:/dev/shm
|
- /dev/shm:/dev/shm
|
||||||
|
{% endif %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user