From c67abd614fe670b1cc771097658dd7efe4a33747 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sat, 29 Mar 2025 06:30:09 -0700
Subject: [PATCH] [V1] Support interleaved modality items (#15605)

Signed-off-by: Roger Wang
---
 .buildkite/test-pipeline.yaml            |  1 +
 tests/conftest.py                        | 39 +++++----
 .../vision_language/test_interleaved.py  | 77 ++++++++++++++++++
 tests/multimodal/test_utils.py           | 80 +++++++++++++++----
 vllm/multimodal/utils.py                 | 72 ++++++-----------
 vllm/v1/engine/processor.py              | 51 +++++-------
 6 files changed, 205 insertions(+), 115 deletions(-)
 create mode 100644 tests/models/decoder_only/vision_language/test_interleaved.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 62872bf8..99358d55 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -431,6 +431,7 @@ steps:
     - pytest -v -s models/encoder_decoder/audio_language -m core_model
     - pytest -v -s models/encoder_decoder/language -m core_model
     - pytest -v -s models/encoder_decoder/vision_language -m core_model
+    - pytest -v -s models/decoder_only/vision_language/test_interleaved.py
 
 - label: Multi-Modal Models Test (Extended) 1 # 48m
   optional: true
diff --git a/tests/conftest.py b/tests/conftest.py
index cc48fceb..6627ab63 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -747,30 +747,27 @@ class VllmRunner:
         videos: Optional[PromptVideoInput] = None,
         audios: Optional[PromptAudioInput] = None,
     ) -> list[TextPrompt]:
-        if images is not None:
-            assert len(prompts) == len(images)
-        if videos is not None:
-            assert len(prompts) == len(videos)
+        if any(x is not None and len(x) != len(prompts)
+               for x in [images, videos, audios]):
+            raise ValueError(
+                "All non-None multimodal inputs must have the same length as "
+                "prompts")
 
-        if audios is not None:
-            assert len(prompts) == len(audios)
+        inputs = []
+        for i, prompt in enumerate(prompts):
+            multi_modal_data = {}
+            if images is not None and (image := images[i]) is not None:
+                multi_modal_data["image"] = image
+            if videos is not None and (video := videos[i]) is not None:
+                multi_modal_data["video"] = video
+            if audios is not None and (audio := audios[i]) is not None:
+                multi_modal_data["audio"] = audio
 
-        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
-        if images is not None:
-            for i, image in enumerate(images):
-                if image is not None:
-                    inputs[i]["multi_modal_data"] = {"image": image}
-
-        if videos is not None:
-            for i, video in enumerate(videos):
-                if video is not None:
-                    inputs[i]["multi_modal_data"] = {"video": video}
-
-        if audios is not None:
-            for i, audio in enumerate(audios):
-                if audio is not None:
-                    inputs[i]["multi_modal_data"] = {"audio": audio}
+            inputs.append(
+                TextPrompt(prompt=prompt,
+                           multi_modal_data=multi_modal_data
+                           if multi_modal_data else None))
 
         return inputs
diff --git a/tests/models/decoder_only/vision_language/test_interleaved.py b/tests/models/decoder_only/vision_language/test_interleaved.py
new file mode 100644
index 00000000..8804497a
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/test_interleaved.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
+
+models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
+
+
+def base_prompt(modalities_str: str) -> str:
+    return f"<|im_start|>user {modalities_str}\nDescribe what you see from these items.<|im_end|><|im_start|>assistant\n"  # noqa: E501
+
+
+INTERLEAVED_PROMPT = base_prompt("