vllm/.buildkite/test-pipeline.yaml

# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.

# This script will be feed into Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 
# to generate the final pipeline yaml file.

# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host, 
#     in this case, commands must be specified. the first command runs on first host, the second
#     command runs on the second host.
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.

# When adding a test
# - If the test belong to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step. 
#   Note that all steps execute in parallel. 

steps:
##### fast check tests  #####

- label: Documentation Build # 2min
  working_dir: "/vllm-workspace/test_docs/docs"
  fast_check: true
  no_gpu: True
  commands:
  - pip install -r requirements-docs.txt
  - SPHINXOPTS=\"-W\" make html
  # Check API reference (if it fails, you may have missing mock imports)
  - grep \"sig sig-object py\" build/html/dev/sampling_params.html

- label: Async Engine, Inputs, Utils, Worker Test # 15min
  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/async_engine
  - tests/test_inputs
  - tests/multimodal
  - tests/test_utils
  - tests/worker
  commands:
  - pytest -v -s async_engine # Async Engine
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal
  - pytest -v -s test_utils.py # Utils
  - pytest -v -s worker # Worker

- label: Basic Correctness Test # 30min
  #mirror_hardwares: [amd]
  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness
  commands:
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
  
- label: Core Test # 10min
  mirror_hardwares: [amd]
  fast_check: true
  source_file_dependencies:
  - vllm/core
  - vllm/distributed
  - tests/core
  commands:
  - pytest -v -s core

- label: Entrypoints Test # 20min
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  commands:
  - pip install -e ./plugins/vllm_add_dummy_model
  - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/openai
  - pytest -v -s entrypoints/test_chat_utils.py


- label: Distributed Tests (4 GPUs) # 10min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  fast_check: true
  source_file_dependencies:
  - vllm/distributed/
  - vllm/core/
  - tests/distributed
  - tests/spec_decode/e2e/test_integration_dist_tp4
  commands:
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

- label: Metrics, Tracing Test # 10min
  num_gpus: 2 
  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/metrics
  - tests/tracing
  commands:
  - pytest -v -s metrics 
  - "pip install \
      'opentelemetry-sdk>=1.26.0,<1.27.0' \
      'opentelemetry-api>=1.26.0,<1.27.0' \
      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
  - pytest -v -s tracing

##### fast check tests  #####
#####  1 GPU test  #####

- label: Regression Test # 5min
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  command: pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 10min
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/tokenization
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- label: Examples Test # 12min
  working_dir: "/vllm-workspace/examples"
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/entrypoints
  - examples/
  commands:
    - pip install awscli tensorizer # for llava example and tensorizer test
    - python3 offline_inference.py
    - python3 cpu_offload.py
    - python3 offline_inference_chat.py
    - python3 offline_inference_with_prefix.py
    - python3 llm_engine_example.py
    - python3 offline_inference_vision_language.py
    - python3 offline_inference_vision_language_multi_image.py
    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference_encoder_decoder.py

- label: Models Test # 1hr10min
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
    - pip install -e ./plugins/vllm_add_dummy_model
    - pytest -v -s models/test_oot_registration.py # it needs a clean process
    - pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py

- label: torch compile integration test
  source_file_dependencies:
  - vllm/
  commands:
    - pytest -v -s ./compile/test_full_graph.py
    - pytest -v -s ./compile/test_wrapper.py


- label: Vision Language Models Test # 42min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  commands:
    - pytest -v -s models -m vlm

- label: Prefix Caching Test # 7min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/prefix_caching
  commands:
    - pytest -v -s prefix_caching

- label: Samplers Test # 18min
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
  commands:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

- label: LogitsProcessor Test # 5min
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/model_executor/layers
  - tests/test_logits_processor
  command: pytest -v -s test_logits_processor.py

- label: Speculative decoding tests # 22min
  source_file_dependencies:
  - vllm/spec_decode
  - tests/spec_decode
  commands:
    # See https://github.com/vllm-project/vllm/issues/5152
    - export VLLM_ATTENTION_BACKEND=XFORMERS
    - pytest -v -s spec_decode

- label: LoRA Test %N # 30min each
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
  parallelism: 4

- label: Kernels Test %N # 30min each
  source_file_dependencies:
  - csrc/
  - vllm/attention
  - tests/kernels
  commands:
    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4

- label: Tensorizer Test # 11min
  mirror_hardwares: [amd]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
  - tests/tensorizer_loader
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s tensorizer_loader

- label: Benchmarks # 9min
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
  source_file_dependencies:
  - benchmarks/
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh

- label: Quantization Test # 15min
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  command: pytest -v -s quantization

- label: LM Eval Small Models # 53min
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pip install lm-eval
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-small.txt -t 1

- label: OpenAI-Compatible Tool Use # 20 min
  fast_check: false
  mirror_hardwares: [ amd ]
  source_file_dependencies:
    - vllm/
    - tests/tool_use
  commands:
    - pytest -v -s tool_use

#####  1 GPU test  #####
#####  multi gpus test  #####

- label: Distributed Comm Ops Test # 7min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py

- label: 2 Node Tests (4 GPUs in total) # 16min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py

- label: Distributed Tests (2 GPUs) # 28min
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
  - pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
  - pytest -v -s distributed/test_chunked_prefill_distributed.py
  - pytest -v -s distributed/test_multimodal_broadcast.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

- label: Multi-step Tests (4 GPUs) # 21min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/model_executor/layers/sampler.py
  - vllm/sequence.py
  - vllm/worker/worker_base.py
  - vllm/worker/worker.py
  - vllm/worker/multi_step_worker.py
  - vllm/worker/model_runner_base.py
  - vllm/worker/model_runner.py
  - vllm/worker/multi_step_model_runner.py
  - vllm/engine
  - tests/multi_step
  commands:
  - pytest -v -s multi_step/test_correctness_async_llm.py
  - pytest -v -s multi_step/test_correctness_llm.py

- label: Pipeline Parallelism Test # 23min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py

- label: LoRA Long Context (Distributed) # 11min
  # This test runs llama 13B, so it is required to run on 4 GPUs.
  num_gpus: 4
  soft_fail: true
  source_file_dependencies:
  - vllm/lora
  - tests/lora/test_long_context
  commands:
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s -x lora/test_long_context.py

- label: Weight Loading Multiple GPU Test
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh


##### multi gpus test #####
##### A100 test #####

- label: Distributed Tests (A100) # optional
  gpu: a100
  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands: 
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
  - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
  - pytest -v -s -x lora/test_mixtral.py

- label: LM Eval Large Models # optional
  gpu: a100
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pip install lm-eval
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-large.txt -t 4