# vllm/tests/multimodal/test_mapper.py

from contextlib import nullcontext

import numpy as np
import pytest
from transformers import CLIPImageProcessor, LlavaNextImageProcessor

from vllm.config import ModelConfig
from vllm.multimodal import MultiModalRegistry
from vllm.multimodal.utils import rescale_image_size


@pytest.fixture
def mm_registry():
    """Construct and return a new ``MultiModalRegistry`` for the test."""
    registry = MultiModalRegistry()
    return registry


@pytest.mark.parametrize("dtype", ["half", "float"])
@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
    """Compare ``mm_registry.map_input`` against HF's ``CLIPImageProcessor``.

    Each image asset is rescaled by ``size_factor`` and passed through both
    the HuggingFace processor and the registry. The two results must expose
    the same keys, and for every key the arrays must have equal shapes and
    be numerically close (``np.allclose``).
    """
    model_id = "llava-hf/llava-1.5-7b-hf"

    reference_processor = CLIPImageProcessor.from_pretrained(model_id)
    assert isinstance(reference_processor, CLIPImageProcessor)

    model_config = ModelConfig(
        model=model_id,
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
        limit_mm_per_prompt={"image": 1},
    )

    mm_registry.init_mm_limits_per_prompt(model_config)

    for asset in image_assets:
        resized = rescale_image_size(asset.pil_image, size_factor)

        # Reference output straight from the HuggingFace processor.
        expected = reference_processor.preprocess(resized, return_tensors="pt")
        # Output produced by the registry for the same image.
        actual = mm_registry.map_input(model_config, {"image": resized})

        assert expected.keys() == actual.keys()
        for key, ref_tensor in expected.items():
            failure_msg = f"Failed for key={key}"
            ref_arr: np.ndarray = ref_tensor.numpy()
            out_arr: np.ndarray = actual[key].numpy()

            assert ref_arr.shape == out_arr.shape, failure_msg
            assert np.allclose(ref_arr, out_arr), failure_msg


@pytest.mark.parametrize("dtype", ["half", "float"])
@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
def test_llava_next_image_processor(image_assets, mm_registry, dtype,
                                    size_factor):
    """Compare ``mm_registry.map_input`` with ``LlavaNextImageProcessor``.

    Each image asset is rescaled by ``size_factor`` and passed through both
    the HuggingFace processor and the registry. The two results must expose
    the same keys, and for every key the arrays must have equal shapes and
    be numerically close (``np.allclose``).
    """
    model_id = "llava-hf/llava-v1.6-vicuna-7b-hf"

    reference_processor = LlavaNextImageProcessor.from_pretrained(model_id)
    assert isinstance(reference_processor, LlavaNextImageProcessor)

    model_config = ModelConfig(
        model=model_id,
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype=dtype,
        revision=None,
        limit_mm_per_prompt={"image": 1},
    )

    mm_registry.init_mm_limits_per_prompt(model_config)

    for asset in image_assets:
        resized = rescale_image_size(asset.pil_image, size_factor)

        # Reference output straight from the HuggingFace processor.
        expected = reference_processor.preprocess(resized, return_tensors="pt")
        # Output produced by the registry for the same image.
        actual = mm_registry.map_input(model_config, {"image": resized})

        assert expected.keys() == actual.keys()
        for key, ref_tensor in expected.items():
            failure_msg = f"Failed for key={key}"
            ref_arr: np.ndarray = ref_tensor.numpy()
            out_arr: np.ndarray = actual[key].numpy()

            assert ref_arr.shape == out_arr.shape, failure_msg
            assert np.allclose(ref_arr, out_arr), failure_msg


@pytest.mark.parametrize(
    ("num_images", "limit", "is_valid"),
    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
     (2, 1, False), (2, 2, True)],
)
def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
    """Check ``map_input`` against the configured per-prompt image limit.

    ``limit`` is passed as the ``"image"`` entry of ``limit_mm_per_prompt``.
    When ``is_valid`` is false, ``map_input`` is expected to raise
    ``ValueError``; otherwise the call must complete without raising.
    """
    model_id = "llava-hf/llava-1.5-7b-hf"

    model_config = ModelConfig(
        model=model_id,
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype="half",
        revision=None,
        limit_mm_per_prompt={"image": limit},
    )

    mm_registry.init_mm_limits_per_prompt(model_config)

    sample = image_assets[0].pil_image
    if num_images == 0:
        # No images: send no "image" entry at all.
        mm_inputs = {}
    else:
        # One image is passed bare; any other count is passed as a list.
        payload = sample if num_images == 1 else [sample] * num_images
        mm_inputs = {"image": payload}

    expectation = nullcontext() if is_valid else pytest.raises(ValueError)
    with expectation:
        mm_registry.map_input(model_config, mm_inputs)


# NOTE: We don't test zero images since the HF processor doesn't support it
@pytest.mark.parametrize("num_images", [1, 2])
def test_image_mapper_multi(image_assets, mm_registry, num_images):
    """Map a list of ``num_images`` copies of one image and check the count.

    The ``"image"`` limit is set to ``num_images``, and the mapped
    ``"pixel_values"`` entry must have length ``num_images``.
    """
    model_id = "llava-hf/llava-1.5-7b-hf"

    model_config = ModelConfig(
        model=model_id,
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=False,
        seed=0,
        dtype="half",
        revision=None,
        limit_mm_per_prompt={"image": num_images},
    )

    mm_registry.init_mm_limits_per_prompt(model_config)

    sample = image_assets[0].pil_image
    # The same image object repeated; always passed as a list, even for one.
    repeated = [sample for _ in range(num_images)]

    mapped_inputs = mm_registry.map_input(model_config, {"image": repeated})
    pixel_values = mapped_inputs["pixel_values"]
    assert len(pixel_values) == num_images
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00			`from contextlib import nullcontext`

[Core] Support image processor (#4197) 2024-06-03 13:56:41 +08:00			`import numpy as np`
			`import pytest`
[Model] Initial support for LLaVA-NeXT (#4199) Co-authored-by: Roger Wang <ywang@roblox.com> 2024-06-10 20:47:15 +08:00			`from transformers import CLIPImageProcessor, LlavaNextImageProcessor`
[Core] Support image processor (#4197) 2024-06-03 13:56:41 +08:00
[VLM] Refactor `MultiModalConfig` initialization and profiling (#7530) 2024-08-17 13:30:55 -07:00			`from vllm.config import ModelConfig`
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00			`from vllm.multimodal import MultiModalRegistry`
[Core] Dynamic image size support for VLMs (#5276) Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> 2024-07-03 11:34:00 +08:00			`from vllm.multimodal.utils import rescale_image_size`
[Core] Support image processor (#4197) 2024-06-03 13:56:41 +08:00
[CI/Build] Add inputs tests (#5215) 2024-06-04 12:01:46 +08:00
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00			`@pytest.fixture`
			`def mm_registry():`
			`return MultiModalRegistry()`


[CI/Build] Add inputs tests (#5215) 2024-06-04 12:01:46 +08:00			`@pytest.mark.parametrize("dtype", ["half", "float"])`
[Core] Dynamic image size support for VLMs (#5276) Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> 2024-07-03 11:34:00 +08:00			`@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])`
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00			`def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):`
[Core] Support image processor (#4197) 2024-06-03 13:56:41 +08:00			`MODEL_NAME = "llava-hf/llava-1.5-7b-hf"`

			`hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)`
			`assert isinstance(hf_processor, CLIPImageProcessor)`

			`model_config = ModelConfig(`
			`model=MODEL_NAME,`
			`tokenizer=MODEL_NAME,`
			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`seed=0,`
			`dtype=dtype,`
			`revision=None,`
[VLM] Refactor `MultiModalConfig` initialization and profiling (#7530) 2024-08-17 13:30:55 -07:00			`limit_mm_per_prompt={"image": 1},`
[Core] Support image processor (#4197) 2024-06-03 13:56:41 +08:00			`)`
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00
[VLM] Refactor `MultiModalConfig` initialization and profiling (#7530) 2024-08-17 13:30:55 -07:00			`mm_registry.init_mm_limits_per_prompt(model_config)`
[Core] Support image processor (#4197) 2024-06-03 13:56:41 +08:00
[CI/Build] Refactor image test assets (#5821) 2024-06-26 16:02:34 +08:00			`for asset in image_assets:`
[Core] Dynamic image size support for VLMs (#5276) Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> 2024-07-03 11:34:00 +08:00			`image = rescale_image_size(asset.pil_image, size_factor)`

[Core] Support image processor (#4197) 2024-06-03 13:56:41 +08:00			`hf_result = hf_processor.preprocess(`
[Core] Dynamic image size support for VLMs (#5276) Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> 2024-07-03 11:34:00 +08:00			`image,`
[CI/Build] Add inputs tests (#5215) 2024-06-04 12:01:46 +08:00			`return_tensors="pt",`
[Core] Dynamic image size support for VLMs (#5276) Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> 2024-07-03 11:34:00 +08:00			`)`
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00			`vllm_result = mm_registry.map_input(`
[Core] Registry for processing model inputs (#5214) Co-authored-by: ywang96 <ywang@roblox.com> 2024-06-28 20:09:56 +08:00			`model_config,`
[Core] Dynamic image size support for VLMs (#5276) Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> 2024-07-03 11:34:00 +08:00			`{"image": image},`
[Core] Support image processor (#4197) 2024-06-03 13:56:41 +08:00			`)`

			`assert hf_result.keys() == vllm_result.keys()`
[CI/Build] Add inputs tests (#5215) 2024-06-04 12:01:46 +08:00			`for key, hf_tensor in hf_result.items():`
			`hf_arr: np.ndarray = hf_tensor.numpy()`
[Core] Support image processor (#4197) 2024-06-03 13:56:41 +08:00			`vllm_arr: np.ndarray = vllm_result[key].numpy()`

			`assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"`
			`assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"`


[Model] Initial support for LLaVA-NeXT (#4199) Co-authored-by: Roger Wang <ywang@roblox.com> 2024-06-10 20:47:15 +08:00			`@pytest.mark.parametrize("dtype", ["half", "float"])`
[Core] Dynamic image size support for VLMs (#5276) Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> 2024-07-03 11:34:00 +08:00			`@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])`
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00			`def test_llava_next_image_processor(image_assets, mm_registry, dtype,`
			`size_factor):`
[Core] Dynamic image size support for VLMs (#5276) Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> 2024-07-03 11:34:00 +08:00			`MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf"`
[Model] Initial support for LLaVA-NeXT (#4199) Co-authored-by: Roger Wang <ywang@roblox.com> 2024-06-10 20:47:15 +08:00
			`hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)`
			`assert isinstance(hf_processor, LlavaNextImageProcessor)`

			`model_config = ModelConfig(`
			`model=MODEL_NAME,`
			`tokenizer=MODEL_NAME,`
			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`seed=0,`
			`dtype=dtype,`
			`revision=None,`
[VLM] Refactor `MultiModalConfig` initialization and profiling (#7530) 2024-08-17 13:30:55 -07:00			`limit_mm_per_prompt={"image": 1},`
[Model] Initial support for LLaVA-NeXT (#4199) Co-authored-by: Roger Wang <ywang@roblox.com> 2024-06-10 20:47:15 +08:00			`)`
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00
[VLM] Refactor `MultiModalConfig` initialization and profiling (#7530) 2024-08-17 13:30:55 -07:00			`mm_registry.init_mm_limits_per_prompt(model_config)`
[Model] Initial support for LLaVA-NeXT (#4199) Co-authored-by: Roger Wang <ywang@roblox.com> 2024-06-10 20:47:15 +08:00
[CI/Build] Refactor image test assets (#5821) 2024-06-26 16:02:34 +08:00			`for asset in image_assets:`
[Core] Dynamic image size support for VLMs (#5276) Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> 2024-07-03 11:34:00 +08:00			`image = rescale_image_size(asset.pil_image, size_factor)`

[Model] Initial support for LLaVA-NeXT (#4199) Co-authored-by: Roger Wang <ywang@roblox.com> 2024-06-10 20:47:15 +08:00			`hf_result = hf_processor.preprocess(`
[Core] Dynamic image size support for VLMs (#5276) Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> 2024-07-03 11:34:00 +08:00			`image,`
[Model] Initial support for LLaVA-NeXT (#4199) Co-authored-by: Roger Wang <ywang@roblox.com> 2024-06-10 20:47:15 +08:00			`return_tensors="pt",`
[Core] Dynamic image size support for VLMs (#5276) Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> 2024-07-03 11:34:00 +08:00			`)`
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00			`vllm_result = mm_registry.map_input(`
[Core] Registry for processing model inputs (#5214) Co-authored-by: ywang96 <ywang@roblox.com> 2024-06-28 20:09:56 +08:00			`model_config,`
[Core] Dynamic image size support for VLMs (#5276) Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> 2024-07-03 11:34:00 +08:00			`{"image": image},`
[Model] Initial support for LLaVA-NeXT (#4199) Co-authored-by: Roger Wang <ywang@roblox.com> 2024-06-10 20:47:15 +08:00			`)`

			`assert hf_result.keys() == vllm_result.keys()`
			`for key, hf_tensor in hf_result.items():`
			`hf_arr: np.ndarray = hf_tensor.numpy()`
			`vllm_arr: np.ndarray = vllm_result[key].numpy()`

			`assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"`
			`assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"`
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00

			`@pytest.mark.parametrize(`
			`("num_images", "limit", "is_valid"),`
			`[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),`
			`(2, 1, False), (2, 2, True)],`
			`)`
			`def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):`
			`MODEL_NAME = "llava-hf/llava-1.5-7b-hf"`

			`model_config = ModelConfig(`
			`model=MODEL_NAME,`
			`tokenizer=MODEL_NAME,`
			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`seed=0,`
			`dtype="half",`
			`revision=None,`
[VLM] Refactor `MultiModalConfig` initialization and profiling (#7530) 2024-08-17 13:30:55 -07:00			`limit_mm_per_prompt={"image": limit},`
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00			`)`

[VLM] Refactor `MultiModalConfig` initialization and profiling (#7530) 2024-08-17 13:30:55 -07:00			`mm_registry.init_mm_limits_per_prompt(model_config)`
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00
			`image = image_assets[0].pil_image`
			`if num_images == 0:`
			`mm_inputs = {}`
			`elif num_images == 1:`
			`mm_inputs = {"image": image}`
			`else:`
			`mm_inputs = {"image": [image] * num_images}`

			`with nullcontext() if is_valid else pytest.raises(ValueError):`
			`mm_registry.map_input(model_config, mm_inputs)`


			`# NOTE: We don't test zero images since the HF processor doesn't support it`
			`@pytest.mark.parametrize("num_images", [1, 2])`
			`def test_image_mapper_multi(image_assets, mm_registry, num_images):`
			`MODEL_NAME = "llava-hf/llava-1.5-7b-hf"`

			`model_config = ModelConfig(`
			`model=MODEL_NAME,`
			`tokenizer=MODEL_NAME,`
			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`seed=0,`
			`dtype="half",`
			`revision=None,`
[VLM] Refactor `MultiModalConfig` initialization and profiling (#7530) 2024-08-17 13:30:55 -07:00			`limit_mm_per_prompt={"image": num_images},`
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00			`)`

[VLM] Refactor `MultiModalConfig` initialization and profiling (#7530) 2024-08-17 13:30:55 -07:00			`mm_registry.init_mm_limits_per_prompt(model_config)`
[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126) 2024-08-15 01:55:42 +08:00
			`image = image_assets[0].pil_image`
			`mm_inputs = {"image": [image] * num_images}`

			`mapped_inputs = mm_registry.map_input(model_config, mm_inputs)`
			`assert len(mapped_inputs["pixel_values"]) == num_images`