# vllm/.buildkite/test-pipeline.yaml

# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See the existing steps below for
# examples of the available options. This file is fed into the Jinja template
# in `test-template-aws.j2` to generate the final pipeline YAML file.
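#
# A hypothetical sketch of a new step (the label, test file, and comments are
# illustrative only, not part of this pipeline):
#
# - label: My New Test                    # name shown in the Buildkite UI
#   mirror_hardwares: [amd]               # also run this step on AMD hardware
#   working_dir: "/vllm-workspace/tests"  # optional
#   num_gpus: 2                           # optional; request multiple GPUs
#   commands:
#   - pytest -v -s my_new_test.py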
steps:
- label: Regression Test
  mirror_hardwares: [amd]
  command: pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
  #mirror_hardwares: [amd]
  command: pytest -v -s async_engine

- label: Basic Correctness Test
  mirror_hardwares: [amd]
  commands:
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test
  mirror_hardwares: [amd]
  command: pytest -v -s core

- label: Distributed Comm Ops Test
  #mirror_hardwares: [amd]
  command: pytest -v -s distributed/test_comm_ops.py
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2

- label: Distributed Tests (2 GPUs)
  mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  commands:
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
  - pytest -v -s spec_decode/e2e/test_integration_dist.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
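  # The eight TEST_DIST_MODEL commands above sweep a small test matrix:
  # {facebook/opt-125m, meta-llama/Llama-2-7b-hf} models x {ray, mp}
  # distributed executor backends x {basic correctness, chunked prefill}.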

- label: Distributed Tests (4 GPUs)
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  commands:
  - pytest -v -s distributed/test_pynccl.py
  # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
  # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py

- label: Engine Test
  mirror_hardwares: [amd]
  command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py

- label: Entrypoints Test
  mirror_hardwares: [amd]
  commands:
  - pytest -v -s entrypoints -m llm
  - pytest -v -s entrypoints -m openai

- label: Examples Test
  working_dir: "/vllm-workspace/examples"
  mirror_hardwares: [amd]
  commands:
  # install aws cli for llava_example.py
  # install tensorizer for tensorize_vllm_model.py
  - pip install awscli tensorizer
  - python3 offline_inference.py
  - python3 offline_inference_with_prefix.py
  - python3 llm_engine_example.py
  - python3 llava_example.py
  - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

- label: Inputs Test
  #mirror_hardwares: [amd]
  commands:
  - bash ../.buildkite/download-images.sh
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal
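
# Note (an assumption based on Buildkite's parallel-job conventions, not spelled
# out in this file): %N in the label below is expanded to the parallel job
# number, and the doubled $$ escapes interpolation at pipeline-upload time so
# the agent resolves $BUILDKITE_PARALLEL_JOB and $BUILDKITE_PARALLEL_JOB_COUNT
# at runtime.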
- label: Kernels Test %N
  #mirror_hardwares: [amd]
  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4

- label: Models Test
  #mirror_hardwares: [amd]
  commands:
  - pytest -v -s models -m \"not vlm\"

- label: Vision Language Models Test
  mirror_hardwares: [amd]
  commands:
  - bash ../.buildkite/download-images.sh
  - pytest -v -s models -m vlm

- label: Prefix Caching Test
  mirror_hardwares: [amd]
  commands:
  - pytest -v -s prefix_caching

- label: Samplers Test
  #mirror_hardwares: [amd]
  command: pytest -v -s samplers

- label: LogitsProcessor Test
  mirror_hardwares: [amd]
  command: pytest -v -s test_logits_processor.py

- label: Utils Test
  command: pytest -v -s test_utils.py

- label: Worker Test
  mirror_hardwares: [amd]
  command: pytest -v -s worker

- label: Speculative decoding tests
  #mirror_hardwares: [amd]
  commands:
  # See https://github.com/vllm-project/vllm/issues/5152
  - export VLLM_ATTENTION_BACKEND=XFORMERS
  - pytest -v -s spec_decode

- label: LoRA Test %N
  #mirror_hardwares: [amd]
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
  parallelism: 4

- label: LoRA Long Context (Distributed)
  #mirror_hardwares: [amd]
  num_gpus: 4
  # This test runs llama-13b, so it needs 4 GPUs.
  commands:
  - pytest -v -s -x lora/test_long_context.py

- label: Tensorizer Test
  #mirror_hardwares: [amd]
  command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader

- label: Metrics Test
  mirror_hardwares: [amd]
  command: pytest -v -s metrics

- label: Quantization Test
  #mirror_hardwares: [amd]
  command: pytest -v -s quantization

- label: Tracing Test
  commands:
  - "pip install \
      opentelemetry-sdk \
      opentelemetry-api \
      opentelemetry-exporter-otlp \
      opentelemetry-semantic-conventions-ai"
  - pytest -v -s tracing

- label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh

- label: Documentation Build
  working_dir: "/vllm-workspace/test_docs/docs"
  no_gpu: True
  commands:
  - pip install -r requirements-docs.txt
  - SPHINXOPTS=\"-W\" make html