[CI/Build] Split up VLM tests (#11083)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Parent: 72ff3a9686
Commit: d1e21a979b
@@ -321,7 +321,7 @@ steps:
 
 ##### models test #####
 
-- label: Basic Models Test # 30min
+- label: Basic Models Test # 24min
   source_file_dependencies:
   - vllm/
   - tests/models
@@ -331,7 +331,7 @@ steps:
   - pytest -v -s models/test_registry.py
   - pytest -v -s models/test_initialization.py
 
-- label: Language Models Test (Standard) # 42min
+- label: Language Models Test (Standard) # 32min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -342,7 +342,7 @@ steps:
   - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
   - pytest -v -s models/embedding/language -m core_model
 
-- label: Language Models Test (Extended) # 50min
+- label: Language Models Test (Extended) # 1h10min
   optional: true
   source_file_dependencies:
   - vllm/
@@ -353,7 +353,7 @@ steps:
   - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/language -m 'not core_model'
 
-- label: Multi-Modal Models Test (Standard) # 26min
+- label: Multi-Modal Models Test (Standard) # 28min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -369,7 +369,7 @@ steps:
   - pytest -v -s models/encoder_decoder/language -m core_model
   - pytest -v -s models/encoder_decoder/vision_language -m core_model
 
-- label: Multi-Modal Models Test (Extended) # 1h15m
+- label: Multi-Modal Models Test (Extended) 1 # 1h16m
   optional: true
   source_file_dependencies:
   - vllm/
@@ -380,14 +380,24 @@ steps:
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
   # HACK - run phi3v tests separately to sidestep this transformers bug
   # https://github.com/huggingface/transformers/issues/34307
   - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/vision_language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
 
+- label: Multi-Modal Models Test (Extended) 2 # 38m
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/vision_language
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
   optional: true
@@ -446,11 +456,11 @@ steps:
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
-  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
-  - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
@@ -540,7 +550,7 @@ steps:
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
   - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s -x lora/test_mixtral.py
 
 - label: LM Eval Large Models # optional
@@ -96,7 +96,8 @@ markers = [
     "core_model: enable this model test in each PR instead of only nightly",
     "cpu_model: enable this model test in CPU tests",
     "quant_model: run this model test under Quantized category",
-    "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
+    "split: run this test as part of a split",
+    "distributed: run this test only in distributed GPU tests",
     "skip_v1: do not run this test with v1",
     "optional: optional tests that are automatically skipped, include --optional to run them",
 ]
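Note: the new `split` and `distributed` markers are parameterized (for example `split(group=0)` and `distributed(num_gpus=2)`), and the pipeline commands above select on them directly with `-m`. As a purely illustrative sketch that is not part of this commit, argument-aware selection can also be implemented in a conftest.py hook; the `--split-group` option and the hook below are hypothetical and only show one way such a filter could work:

# Hypothetical conftest.py sketch: deselect tests whose split(group=...)
# marker does not match a made-up --split-group command-line option.
def pytest_addoption(parser):
    parser.addoption("--split-group", type=int, default=None,
                     help="Only run tests marked with split(group=<value>)")

def pytest_collection_modifyitems(config, items):
    wanted = config.getoption("--split-group")
    if wanted is None:
        return
    selected, deselected = [], []
    for item in items:
        marker = item.get_closest_marker("split")
        if marker is not None and marker.kwargs.get("group") == wanted:
            selected.append(item)
        else:
            deselected.append(item)
    if deselected:
        config.hook.pytest_deselected(items=deselected)
        items[:] = selected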
@@ -1,7 +1,9 @@
 """Common tests for testing .generate() functionality for single / multiple
 image, embedding, and video support for different VLMs in vLLM.
 """
+import math
 import os
+from collections import defaultdict
 from pathlib import PosixPath
 from typing import Type
 
@@ -10,11 +12,12 @@ from transformers import AutoModelForVision2Seq
 from transformers.utils import is_flash_attn_2_available
 
 from vllm.platforms import current_platform
-from vllm.utils import cuda_device_count_stateless, identity
+from vllm.utils import identity
 
 from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
                           _VideoAssets)
-from ....utils import fork_new_process_for_each_test, large_gpu_mark
+from ....utils import (fork_new_process_for_each_test, large_gpu_mark,
+                       multi_gpu_marks)
 from ...utils import check_outputs_equal
 from .vlm_utils import custom_inputs, model_utils, runners
 from .vlm_utils.case_filtering import get_parametrized_options
@@ -382,7 +385,7 @@ VLM_TEST_SETTINGS = {
         prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
     ),
     ### Tensor parallel / multi-gpu broadcast tests
-    "broadcast-chameleon": VLMTestInfo(
+    "chameleon-broadcast": VLMTestInfo(
         models=["facebook/chameleon-7b"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
@@ -393,43 +396,25 @@ VLM_TEST_SETTINGS = {
         vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
         hf_output_post_proc = lambda hf_output, model: hf_output[:2],
         comparator=check_outputs_equal,
-        marks=[
-            pytest.mark.distributed_2_gpus,
-            pytest.mark.skipif(
-                cuda_device_count_stateless() < 2,
-                reason="Need at least 2 GPUs to run the test.",
-            ),
-        ],
+        marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS # type: ignore
     ),
-    "broadcast-llava": VLMTestInfo(
+    "llava-broadcast": VLMTestInfo(
         models=["llava-hf/llava-1.5-7b-hf"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
-        marks=[
-            pytest.mark.distributed_2_gpus,
-            pytest.mark.skipif(
-                cuda_device_count_stateless() < 2,
-                reason="Need at least 2 GPUs to run the test.",
-            )
-        ],
+        marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS # type: ignore
     ),
-    "broadcast-llava_next": VLMTestInfo(
+    "llava_next-broadcast": VLMTestInfo(
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
-        marks=[
-            pytest.mark.distributed_2_gpus,
-            pytest.mark.skipif(
-                cuda_device_count_stateless() < 2,
-                reason="Need at least 2 GPUs to run the test.",
-            )
-        ],
+        marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS # type: ignore
     ),
     ### Custom input edge-cases for specific models
@@ -468,6 +453,41 @@ VLM_TEST_SETTINGS = {
 # yapf: enable
 
 
+def _mark_splits(
+    test_settings: dict[str, VLMTestInfo],
+    *,
+    num_groups: int,
+) -> dict[str, VLMTestInfo]:
+    name_by_test_info_id = {id(v): k for k, v in test_settings.items()}
+    test_infos_by_model = defaultdict[str, list[VLMTestInfo]](list)
+
+    for info in test_settings.values():
+        for model in info.models:
+            test_infos_by_model[model].append(info)
+
+    models = sorted(test_infos_by_model.keys())
+    split_size = math.ceil(len(models) / num_groups)
+
+    new_test_settings = dict[str, VLMTestInfo]()
+
+    for i in range(num_groups):
+        models_in_group = models[i * split_size:(i + 1) * split_size]
+
+        for model in models_in_group:
+            for info in test_infos_by_model[model]:
+                new_marks = (info.marks or []) + [pytest.mark.split(group=i)]
+                new_info = info._replace(marks=new_marks)
+                new_test_settings[name_by_test_info_id[id(info)]] = new_info
+
+    missing_keys = test_settings.keys() - new_test_settings.keys()
+    assert not missing_keys, f"Missing keys: {missing_keys}"
+
+    return new_test_settings
+
+
+VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
+
+
 ### Test wrappers
 # Wrappers around the core test running func for:
 # - single image
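The `_mark_splits` helper added above buckets whole models, so every VLMTestInfo entry that exercises a given model receives the same `split(group=...)` marker and lands in the same CI shard. A minimal standalone sketch of the same bucketing idea, using plain model-name strings instead of VLMTestInfo entries (illustrative only, not part of this commit):

import math

def split_into_groups(models: list[str], num_groups: int) -> dict[str, int]:
    """Assign each model name to a contiguous group, mirroring _mark_splits."""
    ordered = sorted(models)
    split_size = math.ceil(len(ordered) / num_groups)
    return {
        model: i
        for i in range(num_groups)
        for model in ordered[i * split_size:(i + 1) * split_size]
    }

# Example: with two groups, the first half of the sorted model list goes to
# group 0 (Extended 1) and the second half to group 1 (Extended 2).
print(split_into_groups(["llava", "chameleon", "qwen", "phi3v"], num_groups=2))
# {'chameleon': 0, 'llava': 0, 'phi3v': 1, 'qwen': 1}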
@@ -682,10 +682,12 @@ def fork_new_process_for_each_test(
 
 
 def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
-    """Gets a pytest skipif mark, which triggers ig the the device doesn't have
-    meet a minimum memory requirement in gb; can be leveraged via
-    @large_gpu_test to skip tests in environments without enough resources, or
-    called when filtering tests to run directly.
+    """
+    Get a pytest mark, which skips the test if the GPU doesn't meet
+    a minimum memory requirement in GB.
+
+    This can be leveraged via `@large_gpu_test` to skip tests in environments
+    without enough resources, or called when filtering tests to run directly.
     """
     try:
         if current_platform.is_cpu():
@@ -712,26 +714,37 @@ def large_gpu_test(*, min_gb: int):
 
     Currently, the CI machine uses L4 GPU which has 24 GB VRAM.
     """
-    test_skipif = large_gpu_mark(min_gb)
+    mark = large_gpu_mark(min_gb)
 
     def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
-        return test_skipif(f)
+        return mark(f)
 
     return wrapper
 
 
+def multi_gpu_marks(*, num_gpus: int):
+    """Get a collection of pytest marks to apply for `@multi_gpu_test`."""
+    test_selector = pytest.mark.distributed(num_gpus=num_gpus)
+    test_skipif = pytest.mark.skipif(
+        cuda_device_count_stateless() < num_gpus,
+        reason=f"Need at least {num_gpus} GPUs to run the test.",
+    )
+
+    return [test_selector, test_skipif]
+
+
 def multi_gpu_test(*, num_gpus: int):
     """
     Decorate a test to be run only when multiple GPUs are available.
     """
-    test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus")
-    test_skipif = pytest.mark.skipif(
-        cuda_device_count_stateless() < num_gpus,
-        reason=f"Need at least {num_gpus} GPUs to run the test.",
-    )
+    marks = multi_gpu_marks(num_gpus=num_gpus)
 
     def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
-        return test_selector(test_skipif(fork_new_process_for_each_test(f)))
+        func = fork_new_process_for_each_test(f)
+        for mark in reversed(marks):
+            func = mark(func)
+
+        return func
 
     return wrapper
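For context, the net effect of the utils change: `@multi_gpu_test(num_gpus=2)` now applies whatever `multi_gpu_marks(num_gpus=2)` returns, i.e. `pytest.mark.distributed(num_gpus=2)` plus a skipif on `cuda_device_count_stateless()`, and still forks a fresh process per test. A hedged usage sketch follows; the test body is invented for illustration and the import path may differ depending on where the test module lives:

# Illustrative only: how the updated decorator is applied to a test.
# Selection in CI then pairs with commands like:
#   pytest ... -v -s -m 'distributed(num_gpus=2)'
from ..utils import multi_gpu_test  # import path assumed


@multi_gpu_test(num_gpus=2)
def test_dummy_two_gpu_behavior():
    # Runs only when at least 2 CUDA devices are visible; skipped otherwise.
    assert True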