# SPDX-License-Identifier: Apache-2.0
"""Common tests for .generate() functionality, covering single / multiple
image, image embedding, and video inputs for different VLMs in vLLM.
"""
import math
import os
from collections import defaultdict
from pathlib import PosixPath

import pytest
from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq

from vllm.platforms import current_platform
from vllm.utils import identity

from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
                          _VideoAssets)
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
                       multi_gpu_marks)
from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners
from .vlm_utils.case_filtering import get_parametrized_options
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
                              VLMTestInfo, VLMTestType)

# This hack is needed for phi3v & paligemma models
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
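
# Model types that must fall back to the V0 engine for now; the test wrappers
# below set VLLM_USE_V1=0 for these before running.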
REQUIRES_V0_MODELS = [
    # V1 Test: no way to fall back for head_dim = 80
    # https://github.com/vllm-project/vllm/issues/14524
    "qwen_vl",
    # V1 Test: not enough KV cache space in C1.
    "fuyu",
]

# yapf: disable
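# Shared kwargs for the tensor parallel / multi-gpu broadcast tests defined
# near the end of VLM_TEST_SETTINGS; each "*-broadcast" entry unpacks this.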
COMMON_BROADCAST_SETTINGS = {
    "test_type": VLMTestType.IMAGE,
    "dtype": "half",
    "max_tokens": 5,
    "tensor_parallel_size": 2,
    "hf_model_kwargs": {"device_map": "auto"},
    "image_size_factors": [(.25, 0.5, 1.0)],
    "distributed_executor_backend": (
        "ray",
        "mp",
    )
}

### Test configuration for specific models
# NOTE: The convention of the test settings below is to lead each test key
# with the name of the model arch used in the test, using underscores in place
# of hyphens; this makes it more convenient to filter tests for a specific kind
# of model. For example:
#
# To run all test types for a specific key:
#     use the -k flag to substring match with a leading square bracket; if the
#     model arch happens to be a substring of another one, you can add a
#     trailing hyphen. E.g.,
#     - pytest $TEST_FILE -k "[llava-"
#       prevents matching on "[llava_next-" & will match just the enabled cases
#       for llava, i.e., single image, image embedding, and custom input tests.
#
# To run a test for a Test Info for just one of multiple models:
#     use the -k flag to substring match the model name, e.g.,
#     - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
#       prevents matching on OpenGVLab/InternVL2-2B.
#
# You can also combine substrings to match more granularly.
#     ex 1:
#         pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
#     will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
#     match both wrappers for single image tests, since it also matches
#     test_single_image_heavy (which forks if we have a distributed backend)
#     ex 2:
#         pytest $TEST_FILE -k "[llava- or [intern_vl-"
#     will run all of the tests for only llava & internvl.
#
# NOTE you can add --collect-only to any of the above commands to see
# which cases would be selected and deselected by pytest. In general,
# this is a good idea for checking your command first, since tests are slow.
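# For example, to preview which llava cases a filter would select:
#     pytest $TEST_FILE -k "[llava-" --collect-only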

VLM_TEST_SETTINGS = {
    #### Core tests to always run in the CI
    "llava": VLMTestInfo(
        models=["llava-hf/llava-1.5-7b-hf"],
        test_type=(
            VLMTestType.EMBEDDING,
            VLMTestType.IMAGE,
            VLMTestType.CUSTOM_INPUTS
        ),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        convert_assets_to_embeddings=model_utils.get_llava_embeddings,
        max_model_len=4096,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
            ),
            limit_mm_per_prompt={"image": 4},
        )],
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    "paligemma": VLMTestInfo(
        models=["google/paligemma-3b-mix-224"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=identity,
        img_idx_to_prompt = lambda idx: "",
        # Paligemma uses its own sample prompts because the default one fails
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "caption es",
            "cherry_blossom": "What is in the picture?",
        }),
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
        dtype="bfloat16",
        marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")],  # noqa: E501
    ),
    "qwen2_5_vl": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    #### Extended model tests
    "aria": VLMTestInfo(
        models=["rhymes-ai/Aria"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<vlm_image>Please describe the image shortly.",
            "cherry_blossom": "<vlm_image>Please infer the season with reason.",  # noqa: E501
        }),
        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",  # noqa: E501
        stop_str=["<|im_end|>"],
        image_size_factors=[(0.10, 0.15)],
        max_tokens=64,
        marks=[large_gpu_mark(min_gb=64)],
    ),
    "aya_vision": VLMTestInfo(
        models=["CohereForAI/aya-vision-8b"],
        test_type=(VLMTestType.IMAGE),
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
    ),
    "aya_vision-multi_image": VLMTestInfo(
        models=["CohereForAI/aya-vision-8b"],
        test_type=(VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "blip2": VLMTestInfo(
        # TODO: Change back to 2.7b once head_dim = 80 is supported
        models=["Salesforce/blip2-opt-6.7b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
    ),
    "chameleon": VLMTestInfo(
        models=["facebook/chameleon-7b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        # For chameleon, we only compare the sequences
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
        max_tokens=8,
        dtype="bfloat16",
    ),
    "deepseek_vl_v2": VLMTestInfo(
        models=["Isotr0py/deepseek-vl2-tiny"],  # model repo using dynamic module
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",  # noqa: E501
        }),
        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",  # noqa: E501
        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
        stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],  # noqa: E501
        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
    ),
    "fuyu": VLMTestInfo(
        models=["adept/fuyu-8b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"{img_prompt}\n",
        img_idx_to_prompt=lambda idx: "",
        max_model_len=2048,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
    "gemma3": VLMTestInfo(
        models=["google/gemma-3-4b-it"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<start_of_image>What is the season?",  # noqa: E501
        }),
        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
    ),
    "glm4v": VLMTestInfo(
        models=["THUDM/glm-4v-9b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
        }),
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
        # The image embeddings match with HF but the outputs of the language
        # decoder are only consistent up to 2 decimal places.
        # So, we need to reduce the number of tokens for the test to pass.
        max_tokens=8,
        num_logprobs=10,
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "h2ovl": VLMTestInfo(
        models = [
            "h2oai/h2ovl-mississippi-800m",
            # TODO: Re-enable once head_dim = 80 is supported
            # "h2oai/h2ovl-mississippi-2b",
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=8192,
        use_tokenizer_eos=True,
        num_logprobs=10,
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
    ),
    "idefics3": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
    ),
    "intern_vl": VLMTestInfo(
        models=[
            "OpenGVLab/InternVL2-1B",
            "OpenGVLab/InternVL2-2B",
            "OpenGVLab/Mono-InternVL-2B",
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
    "llama4": VLMTestInfo(
        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda _: "<|image|>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        distributed_executor_backend="mp",
        image_size_factors=[(.25, 0.5, 1.0)],
        hf_model_kwargs={"device_map": "auto"},
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        auto_cls=AutoModelForImageTextToText,
        tensor_parallel_size=8,
        vllm_runner_kwargs={"gpu_memory_utilization": 0.8},
        marks=multi_gpu_marks(num_gpus=8),
    ),
    "llava_next": VLMTestInfo(
        models=["llava-hf/llava-v1.6-mistral-7b-hf"],
        test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
            ),
            limit_mm_per_prompt={"image": 4},
        )],
    ),
    "llava_onevision": VLMTestInfo(
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        num_video_frames=16,
        max_model_len=16384,
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),  # noqa: E501
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
            ),
            limit_mm_per_prompt={"video": 4},
            runner_mm_key="videos",
        )],
    ),
    "llava_next_video": VLMTestInfo(
        models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
        num_video_frames=16,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
    ),
    "mantis": VLMTestInfo(
        models=["TIGER-Lab/Mantis-8B-siglip-llama3"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        max_model_len=4096,
        get_stop_token_ids=lambda tok: [128009],
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
        patch_hf_runner=model_utils.mantis_patch_hf_runner,
    ),
    "minicpmv_25": VLMTestInfo(
        models=["openbmb/MiniCPM-Llama3-V-2_5"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
        patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
    ),
    "minicpmo_26": VLMTestInfo(
        models=["openbmb/MiniCPM-o-2_6"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
        patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
    ),
    "minicpmv_26": VLMTestInfo(
        models=["openbmb/MiniCPM-V-2_6"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
    ),
    "molmo": VLMTestInfo(
        models=["allenai/Molmo-7B-D-0924"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,
        max_model_len=4096,
        max_num_seqs=2,
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
    ),
    # Tests for phi3v currently live in another file because of a bug in
    # transformers. Once this issue is fixed, we can enable them here instead.
    # https://github.com/huggingface/transformers/issues/34307
    # "phi3v": VLMTestInfo(
    #     models=["microsoft/Phi-3.5-vision-instruct"],
    #     test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
    #     prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n",  # noqa: E501
    #     img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
    #     max_model_len=4096,
    #     max_num_seqs=2,
    #     task="generate",
    #     # use eager mode for hf runner since phi3v didn't work with flash_attn
    #     hf_model_kwargs={"_attn_implementation": "eager"},
    #     use_tokenizer_eos=True,
    #     vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
    #     num_logprobs=10,
    # ),
    "pixtral_hf": VLMTestInfo(
        models=["nm-testing/pixtral-12b-FP8-dynamic"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<s>[INST]{img_prompt}[/INST]",
        img_idx_to_prompt=lambda idx: "[IMG]",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        marks=[large_gpu_mark(min_gb=48)],
    ),
    "qwen_vl": VLMTestInfo(
        models=["Qwen/Qwen-VL"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,
        img_idx_to_prompt=lambda idx: f"Picture {idx}: <img></img>\n",
        max_model_len=1024,
        max_num_seqs=2,
        vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
        prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
    ),
    "qwen2_vl": VLMTestInfo(
        models=["Qwen/Qwen2-VL-2B-Instruct"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.cpu_model],
    ),
    "skywork_r1v": VLMTestInfo(
        models=["Skywork/Skywork-R1V-38B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts({
            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
            "cherry_blossom": "<image>\nWhat is the season?",
        }),
        multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=80)],
    ),
    ### Tensor parallel / multi-gpu broadcast tests
    "chameleon-broadcast": VLMTestInfo(
        models=["facebook/chameleon-7b"],
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc = lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS  # type: ignore
    ),
    "llava-broadcast": VLMTestInfo(
        models=["llava-hf/llava-1.5-7b-hf"],
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS  # type: ignore
    ),
    "llava_next-broadcast": VLMTestInfo(
        models=["llava-hf/llava-v1.6-mistral-7b-hf"],
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS  # type: ignore
    ),
    ### Custom input edge-cases for specific models
    "intern_vl-diff-patches": VLMTestInfo(
        models=["OpenGVLab/InternVL2-2B"],
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
        custom_test_opts=[
            CustomTestOptions(
                inputs=inp,
                limit_mm_per_prompt={"image": 2},
            ) for inp in custom_inputs.different_patch_input_cases_internvl()
        ],
    ),
    "llava_onevision-multiple-images": VLMTestInfo(
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=16384,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),  # noqa: E501
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
            ),
            limit_mm_per_prompt={"image": 4},
        )],
    ),
    # regression test for https://github.com/vllm-project/vllm/issues/15122
    "qwen2_5_vl-windows-attention": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        custom_test_opts=[CustomTestOptions(
            inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
            limit_mm_per_prompt={"image": 1},
        )],
    ),
}
# yapf: enable


def _mark_splits(
    test_settings: dict[str, VLMTestInfo],
    *,
    num_groups: int,
) -> dict[str, VLMTestInfo]:
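    """Assign each model (and the test entries that use it) to one of
    ``num_groups`` groups and attach a ``pytest.mark.split(group=...)``
    marker, so the suite can be sharded across CI jobs."""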
    name_by_test_info_id = {id(v): k for k, v in test_settings.items()}
    test_infos_by_model = defaultdict[str, list[VLMTestInfo]](list)

    for info in test_settings.values():
        for model in info.models:
            test_infos_by_model[model].append(info)

    models = sorted(test_infos_by_model.keys())
    split_size = math.ceil(len(models) / num_groups)

    new_test_settings = dict[str, VLMTestInfo]()

    for i in range(num_groups):
        models_in_group = models[i * split_size:(i + 1) * split_size]

        for model in models_in_group:
            for info in test_infos_by_model[model]:
                new_marks = (info.marks or []) + [pytest.mark.split(group=i)]
                new_info = info._replace(marks=new_marks)
                new_test_settings[name_by_test_info_id[id(info)]] = new_info

    missing_keys = test_settings.keys() - new_test_settings.keys()
    assert not missing_keys, f"Missing keys: {missing_keys}"

    return new_test_settings
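

# Tag every entry with a split group so the suite can run as two CI shards.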
VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)


### Test wrappers
# Wrappers around the core test running func for:
# - single image
# - multi-image
# - image embeddings
# - video
# - custom inputs
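# Each wrapper forces the V0 engine (VLLM_USE_V1=0) for models listed in
# REQUIRES_V0_MODELS before dispatching to the shared runners.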
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=False,
    ))
def test_single_image_models(tmp_path: PosixPath, model_type: str,
                             test_case: ExpandableVLMTestArgs,
                             hf_runner: type[HfRunner],
                             vllm_runner: type[VllmRunner],
                             image_assets: _ImageAssets, monkeypatch):
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=False,
    ))
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
                            test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            image_assets: _ImageAssets, monkeypatch):
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=False,
    ))
def test_image_embedding_models(model_type: str,
                                test_case: ExpandableVLMTestArgs,
                                hf_runner: type[HfRunner],
                                vllm_runner: type[VllmRunner],
                                image_assets: _ImageAssets, monkeypatch):
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=False,
    ))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
                      video_assets: _VideoAssets, monkeypatch):
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=False,
    ))
def test_custom_inputs_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    monkeypatch,
):
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )


#### Tests filtering for things running each test as a new process
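# These mirror the wrappers above, but select the cases that request a new
# process per test (create_new_process_for_each_test=True).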
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                   test_case: ExpandableVLMTestArgs,
                                   hf_runner: type[HfRunner],
                                   vllm_runner: type[VllmRunner],
                                   image_assets: _ImageAssets, monkeypatch):
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
                                  test_case: ExpandableVLMTestArgs,
                                  hf_runner: type[HfRunner],
                                  vllm_runner: type[VllmRunner],
                                  image_assets: _ImageAssets, monkeypatch):
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_image_embedding_models_heavy(model_type: str,
                                      test_case: ExpandableVLMTestArgs,
                                      hf_runner: type[HfRunner],
                                      vllm_runner: type[VllmRunner],
                                      image_assets: _ImageAssets, monkeypatch):
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=True,
    ))
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            video_assets: _VideoAssets, monkeypatch):
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
def test_custom_inputs_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    monkeypatch,
):
    if model_type in REQUIRES_V0_MODELS:
        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )