# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.

# This script will be fed into the Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.

# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
# fast_check_only(bool): run this test on the fastcheck pipeline only.
# optional(bool): never run this test by default (i.e. it must be unblocked manually) unless it is a scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of additional hardware to run the test on. currently only supports [amd].
# gpu(str): override the GPU selection for the test. defaults to L4 GPUs. currently only supports a100.
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2 or 4.
# num_nodes(int): the number of nodes to simulate in a multi-node setup by launching multiple containers on one host;
#   in this case, commands must be specified: the first command runs on the first host, and the second
#   command runs on the second host.
# working_dir(str): specify where the command should execute. defaults to /vllm-workspace/tests.
# source_file_dependencies(list): the list of path prefixes that opt the test in; if empty, the test will always run.
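
# For illustration, a hypothetical step using the fields above might look like
# the following (the label, paths, and pytest target are made-up placeholders,
# not real tests in this repo):
#
# - label: My Feature Test # 5min
#   fast_check: true
#   working_dir: "/vllm-workspace/tests"
#   source_file_dependencies:
#   - vllm/my_feature/
#   - tests/my_feature
#   commands:
#   - pytest -v -s my_feature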

# When adding a test
# - If the test belongs to an existing group, add it there.
# - If the test is short, add it to any existing step.
# - If the test takes more than 10min, it is okay to create a new step.
# Note that all steps execute in parallel.
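
# A sketch of how num_nodes is used (a hypothetical step; see the real
# "2 Node Tests" step below for actual usage): with num_nodes: 2, the first
# entry in commands runs on the first host and the second entry runs on the
# second host.
#
# - label: My Two Node Test
#   working_dir: "/vllm-workspace/tests"
#   num_gpus: 2
#   num_nodes: 2
#   commands:
#   - echo 'running on the first node'
#   - echo 'running on the second node'
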
steps:

##### fast check tests #####

- label: Documentation Build # 2min
  working_dir: "/vllm-workspace/test_docs/docs"
  fast_check: true
  no_gpu: True
  commands:
  - pip install -r ../../requirements/docs.txt
  - SPHINXOPTS=\"-W\" make html
  # Check API reference (if it fails, you may have missing mock imports)
  - grep \"sig sig-object py\" build/html/api/inference_params.html

- label: Async Engine, Inputs, Utils, Worker Test # 24min
  source_file_dependencies:
  - vllm/
  - tests/mq_llm_engine
  - tests/async_engine
  - tests/test_inputs
  - tests/multimodal
  - tests/test_utils
  - tests/worker
  - tests/standalone_tests/lazy_imports.py
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s mq_llm_engine # MQLLMEngine
  - pytest -v -s async_engine # AsyncLLMEngine
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal
  - pytest -v -s test_utils.py # Utils
  - pytest -v -s worker # Worker

- label: Python-only Installation Test
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
  - setup.py
  commands:
  - bash standalone_tests/python_only_compile.sh

- label: Basic Correctness Test # 30min
  #mirror_hardwares: [amd]
  fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
  - tests/basic_correctness/test_preemption
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Chunked Prefill Test
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_chunked_prefill
  commands:
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py

- label: Core Test # 10min
  mirror_hardwares: [amd]
  fast_check: true
  source_file_dependencies:
  - vllm/core
  - vllm/distributed
  - tests/core
  commands:
  - pytest -v -s core

- label: Entrypoints Test # 40min
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
  - pytest -v -s entrypoints/test_chat_utils.py
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Distributed Tests (4 GPUs) # 10min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/core/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/spec_decode/e2e/test_integration_dist_tp4
  - tests/compile/test_basic_correctness
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/test_async_llm_dp.py
  commands:
  # test with tp=2 and external_dp=2
  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
  - python3 rlhf.py
  - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

- label: Metrics, Tracing Test # 10min
  mirror_hardwares: [amd]
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/metrics
  - tests/tracing
  commands:
  - pytest -v -s metrics
  - pytest -v -s tracing

##### fast check tests #####
##### 1 GPU test #####

- label: Regression Test # 5min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 10min
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/tokenization
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- label: V1 Test
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  # split the test to avoid interference
  - pytest -v -s v1/core
  - pytest -v -s v1/engine
  - pytest -v -s v1/entrypoints
  - pytest -v -s v1/sample
  - pytest -v -s v1/worker
  - pytest -v -s v1/structured_output
  - pytest -v -s v1/test_stats.py
  - pytest -v -s v1/test_utils.py
  - pytest -v -s v1/test_oracle.py
  # TODO: accuracy does not match, whether setting
  # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
  - pytest -v -s v1/e2e
  # Integration test for streaming correctness (requires special branch).
  - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: Examples Test # 25min
  working_dir: "/vllm-workspace/examples"
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/entrypoints
  - examples/
  commands:
  - pip install tensorizer # for tensorizer test
  - python3 offline_inference/basic/generate.py --model facebook/opt-125m
  - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
  - python3 offline_inference/basic/chat.py
  - python3 offline_inference/prefix_caching.py
  - python3 offline_inference/llm_engine_example.py
  - python3 offline_inference/audio_language.py --seed 0
  - python3 offline_inference/vision_language.py --seed 0
  - python3 offline_inference/vision_language_embedding.py --seed 0
  - python3 offline_inference/vision_language_multi_image.py --seed 0
  - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
  - python3 offline_inference/encoder_decoder.py
  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
  - python3 offline_inference/basic/classify.py
  - python3 offline_inference/basic/embed.py
  - python3 offline_inference/basic/score.py
  - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

- label: Prefix Caching Test # 9min
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/prefix_caching
  commands:
  - pytest -v -s prefix_caching

- label: Samplers Test # 36min
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
  - tests/conftest.py
  commands:
  - pytest -v -s samplers
  - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

- label: LogitsProcessor Test # 5min
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/model_executor/guided_decoding
  - tests/test_logits_processor
  - tests/model_executor/test_guided_processors
  commands:
  - pytest -v -s test_logits_processor.py
  - pytest -v -s model_executor/test_guided_processors.py

- label: Speculative decoding tests # 40min
  source_file_dependencies:
  - vllm/spec_decode
  - tests/spec_decode
  - vllm/model_executor/models/eagle.py
  commands:
  - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
  - pytest -v -s spec_decode/e2e/test_eagle_correctness.py

- label: LoRA Test %N # 15min each
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
  parallelism: 4

- label: PyTorch Compilation Unit Tests
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_pass_manager.py
  - pytest -v -s compile/test_fusion.py

- label: PyTorch Fullgraph Smoke Test # 9min
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_basic_correctness.py
  # these tests need to be separated; they cannot be combined
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py

- label: PyTorch Fullgraph Test # 18min
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py

- label: Kernels Test %N # 1h each
  # mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
  - vllm/attention
  - tests/kernels
  commands:
  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4

- label: Tensorizer Test # 11min
  # mirror_hardwares: [amd]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
  - tests/tensorizer_loader
  commands:
  - apt-get update && apt-get install -y curl libsodium23
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s tensorizer_loader

- label: Benchmarks # 9min
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
  source_file_dependencies:
  - benchmarks/
  commands:
  - bash scripts/run-benchmarks.sh

- label: Benchmarks CLI Test # 10min
  source_file_dependencies:
  - vllm/
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/

- label: Quantization Test # 33min
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

- label: LM Eval Small Models # 53min
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-small.txt -t 1

- label: OpenAI API correctness
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/

- label: Encoder Decoder tests # 5min
  source_file_dependencies:
  - vllm/
  - tests/encoder_decoder
  commands:
  - pytest -v -s encoder_decoder

- label: OpenAI-Compatible Tool Use # 20 min
  fast_check: false
  #mirror_hardwares: [ amd ]
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  - tests/mistral_tool_use
  commands:
  - pytest -v -s tool_use
  - pytest -v -s mistral_tool_use

##### models test #####

- label: Basic Models Test # 24min
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
  - pytest -v -s models/test_transformers.py
  - pytest -v -s models/test_registry.py
  # V1 Test: https://github.com/vllm-project/vllm/issues/14531
  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'

- label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/language
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
  commands:
  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
  - pip install causal-conv1d
  - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
  - pytest -v -s models/embedding/language -m core_model

- label: Language Models Test (Extended) # 1h10min
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/language
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
  commands:
  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
  - pip install causal-conv1d
  - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
  - pytest -v -s models/embedding/language -m 'not core_model'

- label: Multi-Modal Models Test (Standard) # 40min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/audio_language
  - tests/models/encoder_decoder/vision_language
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal
  - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
  - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
  - pytest -v -s models/embedding/vision_language -m core_model
  - pytest -v -s models/encoder_decoder/audio_language -m core_model
  - pytest -v -s models/encoder_decoder/language -m core_model
  - pytest -v -s models/encoder_decoder/vision_language -m core_model
  - pytest -v -s models/decoder_only/vision_language/test_interleaved.py

- label: Multi-Modal Models Test (Extended) 1 # 48m
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/vision_language
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
  - pytest -v -s models/embedding/vision_language -m 'not core_model'
  - pytest -v -s models/encoder_decoder/language -m 'not core_model'
  - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

- label: Multi-Modal Models Test (Extended) 2 # 38m
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/vision_language
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'

# This test is used only in the PR development phase to test individual models and should never run on main
- label: Custom Models Test
  mirror_hardwares: [amd]
  optional: true
  commands:
  - echo 'Testing custom models...'
  # PR authors can temporarily add commands below to test individual models
  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

##### 1 GPU test #####
##### multi gpus test #####

- label: Distributed Comm Ops Test # 7min
  mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py

- label: 2 Node Tests (4 GPUs in total) # 16min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
  - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'

- label: Distributed Tests (2 GPUs) # 40min
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  - vllm/compilation
  - vllm/worker/worker_base.py
  - vllm/worker/worker.py
  - vllm/worker/model_runner.py
  - entrypoints/llm/test_collective_rpc.py
  - tests/v1/test_async_llm_dp.py
  - vllm/v1/engine/
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown

- label: Plugin Tests (2 GPUs) # 40min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
  # begin platform plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process

- label: Multi-step Tests (4 GPUs) # 36min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/model_executor/layers/sampler.py
  - vllm/sequence.py
  - vllm/worker/worker_base.py
  - vllm/worker/worker.py
  - vllm/worker/multi_step_worker.py
  - vllm/worker/model_runner_base.py
  - vllm/worker/model_runner.py
  - vllm/worker/multi_step_model_runner.py
  - vllm/engine
  - tests/multi_step
  commands:
  # this test is quite flaky
  # TODO: investigate and fix.
  # - pytest -v -s multi_step/test_correctness_async_llm.py
  - pytest -v -s multi_step/test_correctness_llm.py

- label: Pipeline Parallelism Test # 45min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py

- label: LoRA TP Test (Distributed)
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
  # FIXIT: find out which code initializes CUDA before running the test
  # until that is fixed, we need to use spawn to run the tests
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  # There is some Tensor Parallelism related processing logic in LoRA that
  # requires multi-GPU testing for validation.
  - pytest -v -s -x lora/test_chatglm3_tp.py
  - pytest -v -s -x lora/test_llama_tp.py

- label: Weight Loading Multiple GPU Test # 33min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU Test - Large Models # optional
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

##### multi gpus test #####
##### A100 test #####

- label: Distributed Tests (A100) # optional
  gpu: a100
  optional: true
  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
  # NOTE: don't test the llama model here; the hf implementation seems to be buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: LM Eval Large Models # optional
  gpu: a100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-large.txt -t 4