# vllm/tests/entrypoints/test_chat_utils.py

import warnings
from typing import Optional

import pytest
from PIL import Image

from vllm.assets.image import ImageAsset
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (parse_chat_messages,
                                         parse_chat_messages_futures)
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.utils import encode_image_base64
from vllm.transformers_utils.tokenizer_group import TokenizerGroup

PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"


@pytest.fixture(scope="module")
def phi3v_model_config():
    """Build the module-scoped ``ModelConfig`` for ``PHI3V_MODEL_ID``.

    The config allows at most two images per prompt; the "too many images"
    tests in this module match on an error message that names that limit.
    """
    config_kwargs = dict(
        task="generate",
        tokenizer=PHI3V_MODEL_ID,
        tokenizer_mode="auto",
        trust_remote_code=True,
        dtype="bfloat16",
        seed=0,
        limit_mm_per_prompt={"image": 2},
    )
    return ModelConfig(PHI3V_MODEL_ID, **config_kwargs)


@pytest.fixture(scope="module")
def phi3v_tokenizer():
    """Build the module-scoped ``TokenizerGroup`` for ``PHI3V_MODEL_ID``."""
    tokenizer_group = TokenizerGroup(
        tokenizer_id=PHI3V_MODEL_ID,
        enable_lora=False,
        max_num_seqs=5,
        max_input_length=None,
    )
    return tokenizer_group


@pytest.fixture(scope="module")
def image_url():
    """Return the 'cherry_blossom' asset as a base64 ``data:`` URL.

    The URL is labelled ``image/jpeg`` and is shared by every test in this
    module as the image payload.
    """
    pil_image = ImageAsset('cherry_blossom').pil_image
    encoded = encode_image_base64(pil_image)
    return f"data:image/jpeg;base64,{encoded}"


def _assert_mm_data_is_image_input(
    mm_data: Optional[MultiModalDataDict],
    image_count: int,
) -> None:
    """Assert that ``mm_data`` carries only image data of the expected size.

    Args:
        mm_data: Multi-modal data returned by the chat message parser.
        image_count: Number of images the parsed conversation should hold.

    Raises:
        AssertionError: If ``mm_data`` is ``None``, has keys other than
            ``"image"``, or its image payload has the wrong shape. A count
            of one expects a bare ``Image.Image``; any other count expects
            a list of exactly that length.
    """
    assert mm_data is not None
    assert set(mm_data.keys()) == {"image"}

    images = mm_data.get("image")
    assert images is not None

    if image_count != 1:
        assert isinstance(images, list) and len(images) == image_count
        return

    assert isinstance(images, Image.Image)


def test_parse_chat_messages_single_image(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """A single image part yields one placeholder ahead of the text."""
    messages = [{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": "What's in the image?"},
        ],
    }]
    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?",
    }]

    conversation, mm_data = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
    )

    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 1)


@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: one image part, multi-modal data is awaited."""
    messages = [{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": "What's in the image?"},
        ],
    }]
    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?",
    }]

    conversation, mm_future = parse_chat_messages_futures(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
    )

    assert conversation == expected_conversation
    mm_data = await mm_future
    _assert_mm_data_is_image_input(mm_data, 1)


def test_parse_chat_messages_multiple_images(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Two image parts yield two numbered placeholders ahead of the text."""
    # Distinct dict objects per image part, as in a literal request body.
    image_parts = [
        {"type": "image_url", "image_url": {"url": image_url}}
        for _ in range(2)
    ]
    text_part = {"type": "text", "text": "What's in these images?"}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]
    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]

    conversation, mm_data = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
    )

    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 2)


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: two image parts, multi-modal data is awaited."""
    # Distinct dict objects per image part, as in a literal request body.
    image_parts = [
        {"type": "image_url", "image_url": {"url": image_url}}
        for _ in range(2)
    ]
    text_part = {"type": "text", "text": "What's in these images?"}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]
    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]

    conversation, mm_future = parse_chat_messages_futures(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
    )

    assert conversation == expected_conversation
    mm_data = await mm_future
    _assert_mm_data_is_image_input(mm_data, 2)


def test_parse_chat_messages_placeholder_already_in_prompt(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Text that already names both placeholders is passed through as-is."""
    question = (
        "What's in <|image_1|> and how does it compare to <|image_2|>?"
    )
    # Distinct dict objects per image part, as in a literal request body.
    image_parts = [
        {"type": "image_url", "image_url": {"url": image_url}}
        for _ in range(2)
    ]
    text_part = {"type": "text", "text": question}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]

    conversation, mm_data = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
    )

    # No extra placeholder is expected: the content equals the input text.
    assert conversation == [{"role": "user", "content": question}]
    _assert_mm_data_is_image_input(mm_data, 2)


def test_parse_chat_messages_placeholder_one_already_in_prompt(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Only the placeholder missing from the text is prepended."""
    question = (
        "What's in <|image_1|> and how does it compare to the other one?"
    )
    # Distinct dict objects per image part, as in a literal request body.
    image_parts = [
        {"type": "image_url", "image_url": {"url": image_url}}
        for _ in range(2)
    ]
    text_part = {"type": "text", "text": question}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]
    expected_conversation = [{
        "role":
        "user",
        "content":
        "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
        "other one?"
    }]

    conversation, mm_data = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
    )

    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 2)


def test_parse_chat_messages_multiple_images_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Placeholder numbering continues across user turns."""
    first_user_turn = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": "What's in this image?"},
        ],
    }
    assistant_turn = {"role": "assistant", "content": "Some stuff."}
    second_user_turn = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": "What about this one?"},
        ],
    }
    expected_conversation = [
        {"role": "user", "content": "<|image_1|>\nWhat's in this image?"},
        {"role": "assistant", "content": "Some stuff."},
        {"role": "user", "content": "<|image_2|>\nWhat about this one?"},
    ]

    conversation, mm_data = parse_chat_messages(
        [first_user_turn, assistant_turn, second_user_turn],
        phi3v_model_config,
        phi3v_tokenizer,
    )

    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 2)


def test_parse_chat_messages_rejects_too_many_images_in_one_message(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Three images in a single message raise ``ValueError``."""
    # Distinct dict objects per image part, as in a literal request body.
    image_parts = [
        {"type": "image_url", "image_url": {"url": image_url}}
        for _ in range(3)
    ]
    text_part = {"type": "text", "text": "What's in these images?"}
    messages = [{"role": "user", "content": [*image_parts, text_part]}]
    expected_error = "At most 2 image\\(s\\) may be provided in one request\\."

    with warnings.catch_warnings():
        # Silence the "never awaited" warning for the image-loading
        # coroutine while the parser is expected to fail.
        warnings.filterwarnings(
            "ignore",
            message="coroutine 'async_get_and_parse_image' was never awaited")
        with pytest.raises(ValueError, match=expected_error):
            parse_chat_messages(
                messages,
                phi3v_model_config,
                phi3v_tokenizer,
            )


def test_parse_chat_messages_rejects_too_many_images_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Three images spread over two user turns raise ``ValueError``."""
    first_user_turn = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": "What's in this image?"},
        ],
    }
    assistant_turn = {"role": "assistant", "content": "Some stuff."}
    second_user_turn = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": "What about these two?"},
        ],
    }
    messages = [first_user_turn, assistant_turn, second_user_turn]
    expected_error = "At most 2 image\\(s\\) may be provided in one request\\."

    with warnings.catch_warnings():
        # Silence the "never awaited" warning for the image-loading
        # coroutine while the parser is expected to fail.
        warnings.filterwarnings(
            "ignore",
            message="coroutine 'async_get_and_parse_image' was never awaited")
        with pytest.raises(ValueError, match=expected_error):
            parse_chat_messages(
                messages,
                phi3v_model_config,
                phi3v_tokenizer,
            )


def test_parse_chat_messages_multiple_images_uncommon_input(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Shorthand parts (bare string, untyped ``image_url`` dicts) parse too.

    The text comes first in the request, yet the expected content still has
    both image placeholders ahead of it.
    """
    shorthand_content = [
        "What's in these images?",
        {"image_url": image_url},
        {"image_url": image_url},
    ]
    messages = [{"role": "user", "content": shorthand_content}]
    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]

    conversation, mm_data = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
    )

    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 2)
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00			`import warnings`
[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`from typing import Optional`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00
			`import pytest`
			`from PIL import Image`

			`from vllm.assets.image import ImageAsset`
			`from vllm.config import ModelConfig`
[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`from vllm.entrypoints.chat_utils import (parse_chat_messages,`
			`parse_chat_messages_futures)`
			`from vllm.multimodal import MultiModalDataDict`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00			`from vllm.multimodal.utils import encode_image_base64`
			`from vllm.transformers_utils.tokenizer_group import TokenizerGroup`

			`PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"`


			`@pytest.fixture(scope="module")`
			`def phi3v_model_config():`
			`return ModelConfig(PHI3V_MODEL_ID,`
[Model] Add user-configurable task for models that support both generation and embedding (#9424) 2024-10-19 02:31:58 +08:00			`task="generate",`
			`tokenizer=PHI3V_MODEL_ID,`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00			`tokenizer_mode="auto",`
			`trust_remote_code=True,`
			`dtype="bfloat16",`
			`seed=0,`
			`limit_mm_per_prompt={`
			`"image": 2,`
			`})`


			`@pytest.fixture(scope="module")`
			`def phi3v_tokenizer():`
			`return TokenizerGroup(`
			`tokenizer_id=PHI3V_MODEL_ID,`
			`enable_lora=False,`
			`max_num_seqs=5,`
			`max_input_length=None,`
			`)`


			`@pytest.fixture(scope="module")`
			`def image_url():`
			`image = ImageAsset('cherry_blossom')`
			`base64 = encode_image_base64(image.pil_image)`
			`return f"data:image/jpeg;base64,{base64}"`


[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`def _assert_mm_data_is_image_input(`
			`mm_data: Optional[MultiModalDataDict],`
			`image_count: int,`
			`) -> None:`
			`assert mm_data is not None`
			`assert set(mm_data.keys()) == {"image"}`

			`image_data = mm_data.get("image")`
			`assert image_data is not None`

			`if image_count == 1:`
			`assert isinstance(image_data, Image.Image)`
			`else:`
			`assert isinstance(image_data, list) and len(image_data) == image_count`


			`def test_parse_chat_messages_single_image(`
			`phi3v_model_config,`
			`phi3v_tokenizer,`
			`image_url,`
			`):`
			`conversation, mm_data = parse_chat_messages([{`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00			`"role":`
			`"user",`
			`"content": [{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "text",`
			`"text": "What's in the image?"`
			`}]`
			`}], phi3v_model_config, phi3v_tokenizer)`

			`assert conversation == [{`
			`"role": "user",`
			`"content": "<\|image_1\|>\nWhat's in the image?"`
			`}]`
[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`_assert_mm_data_is_image_input(mm_data, 1)`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00

			`@pytest.mark.asyncio`
[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`async def test_parse_chat_messages_single_image_async(`
			`phi3v_model_config,`
			`phi3v_tokenizer,`
			`image_url,`
			`):`
			`conversation, mm_future = parse_chat_messages_futures([{`
			`"role":`
			`"user",`
			`"content": [{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "text",`
			`"text": "What's in the image?"`
			`}]`
			`}], phi3v_model_config, phi3v_tokenizer)`

			`assert conversation == [{`
			`"role": "user",`
			`"content": "<\|image_1\|>\nWhat's in the image?"`
			`}]`
			`_assert_mm_data_is_image_input(await mm_future, 1)`


			`def test_parse_chat_messages_multiple_images(`
			`phi3v_model_config,`
			`phi3v_tokenizer,`
			`image_url,`
			`):`
			`conversation, mm_data = parse_chat_messages([{`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00			`"role":`
			`"user",`
			`"content": [{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "text",`
			`"text": "What's in these images?"`
			`}]`
			`}], phi3v_model_config, phi3v_tokenizer)`

			`assert conversation == [{`
			`"role":`
			`"user",`
			`"content":`
			`"<\|image_1\|>\n<\|image_2\|>\nWhat's in these images?"`
			`}]`
[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`_assert_mm_data_is_image_input(mm_data, 2)`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00

			`@pytest.mark.asyncio`
[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`async def test_parse_chat_messages_multiple_images_async(`
			`phi3v_model_config,`
			`phi3v_tokenizer,`
			`image_url,`
			`):`
			`conversation, mm_future = parse_chat_messages_futures([{`
			`"role":`
			`"user",`
			`"content": [{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "text",`
			`"text": "What's in these images?"`
			`}]`
			`}], phi3v_model_config, phi3v_tokenizer)`

			`assert conversation == [{`
			`"role":`
			`"user",`
			`"content":`
			`"<\|image_1\|>\n<\|image_2\|>\nWhat's in these images?"`
			`}]`
			`_assert_mm_data_is_image_input(await mm_future, 2)`


			`def test_parse_chat_messages_placeholder_already_in_prompt(`
			`phi3v_model_config,`
			`phi3v_tokenizer,`
			`image_url,`
			`):`
			`conversation, mm_data = parse_chat_messages([{`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00			`"role":`
			`"user",`
			`"content": [{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type":`
			`"text",`
			`"text":`
			`"What's in <\|image_1\|> and how does it compare to <\|image_2\|>?"`
			`}]`
			`}], phi3v_model_config, phi3v_tokenizer)`

			`assert conversation == [{`
			`"role":`
			`"user",`
			`"content":`
			`"What's in <\|image_1\|> and how does it compare to <\|image_2\|>?"`
			`}]`
[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`_assert_mm_data_is_image_input(mm_data, 2)`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00

[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`def test_parse_chat_messages_placeholder_one_already_in_prompt(`
			`phi3v_model_config,`
			`phi3v_tokenizer,`
			`image_url,`
			`):`
			`conversation, mm_data = parse_chat_messages([{`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00			`"role":`
			`"user",`
			`"content": [{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type":`
			`"text",`
			`"text":`
			`"What's in <\|image_1\|> and how does it compare to the other one?"`
			`}]`
			`}], phi3v_model_config, phi3v_tokenizer)`

			`assert conversation == [{`
			`"role":`
			`"user",`
			`"content":`
			`"<\|image_2\|>\nWhat's in <\|image_1\|> and how does it compare to the "`
			`"other one?"`
			`}]`
[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`_assert_mm_data_is_image_input(mm_data, 2)`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00

[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`def test_parse_chat_messages_multiple_images_across_messages(`
			`phi3v_model_config,`
			`phi3v_tokenizer,`
			`image_url,`
			`):`
			`conversation, mm_data = parse_chat_messages([{`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00			`"role":`
			`"user",`
			`"content": [{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "text",`
			`"text": "What's in this image?"`
			`}]`
			`}, {`
			`"role": "assistant",`
			`"content": "Some stuff."`
			`}, {`
			`"role":`
			`"user",`
			`"content": [{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "text",`
			`"text": "What about this one?"`
			`}]`
			`}], phi3v_model_config, phi3v_tokenizer)`

			`assert conversation == [`
			`{`
			`"role": "user",`
			`"content": "<\|image_1\|>\nWhat's in this image?"`
			`},`
			`{`
			`"role": "assistant",`
			`"content": "Some stuff."`
			`},`
			`{`
			`"role": "user",`
			`"content": "<\|image_2\|>\nWhat about this one?"`
			`},`
			`]`
[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`_assert_mm_data_is_image_input(mm_data, 2)`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00

[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`def test_parse_chat_messages_rejects_too_many_images_in_one_message(`
			`phi3v_model_config,`
			`phi3v_tokenizer,`
			`image_url,`
			`):`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00			`with warnings.catch_warnings():`
			`warnings.filterwarnings(`
			`"ignore",`
			`message="coroutine 'async_get_and_parse_image' was never awaited")`
			`with pytest.raises(`
			`ValueError,`
			`match="At most 2 image\\(s\\) may be provided in one request\\."`
			`):`
			`parse_chat_messages([{`
			`"role":`
			`"user",`
			`"content": [{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "text",`
			`"text": "What's in these images?"`
			`}]`
			`}], phi3v_model_config, phi3v_tokenizer)`


[Frontend] Multimodal support in offline chat (#8098) 2024-09-04 13:22:17 +08:00			`def test_parse_chat_messages_rejects_too_many_images_across_messages(`
			`phi3v_model_config,`
			`phi3v_tokenizer,`
			`image_url,`
			`):`
[Frontend][VLM] Add support for multiple multi-modal items (#8049) 2024-08-31 16:35:53 -07:00			`with warnings.catch_warnings():`
			`warnings.filterwarnings(`
			`"ignore",`
			`message="coroutine 'async_get_and_parse_image' was never awaited")`
			`with pytest.raises(`
			`ValueError,`
			`match="At most 2 image\\(s\\) may be provided in one request\\."`
			`):`
			`parse_chat_messages([{`
			`"role":`
			`"user",`
			`"content": [{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "text",`
			`"text": "What's in this image?"`
			`}]`
			`}, {`
			`"role": "assistant",`
			`"content": "Some stuff."`
			`}, {`
			`"role":`
			`"user",`
			`"content": [{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`}, {`
			`"type": "text",`
			`"text": "What about these two?"`
			`}]`
			`}], phi3v_model_config, phi3v_tokenizer)`
[Frontend] Support simpler image input format (#9478) 2024-10-18 23:17:07 -07:00

			`def test_parse_chat_messages_multiple_images_uncommon_input(`
			`phi3v_model_config,`
			`phi3v_tokenizer,`
			`image_url,`
			`):`
			`conversation, mm_data = parse_chat_messages([{`
			`"role":`
			`"user",`
			`"content": [`
			`"What's in these images?", {`
			`"image_url": image_url`
			`}, {`
			`"image_url": image_url`
			`}`
			`]`
			`}], phi3v_model_config, phi3v_tokenizer)`

			`assert conversation == [{`
			`"role":`
			`"user",`
			`"content":`
			`"<\|image_1\|>\n<\|image_2\|>\nWhat's in these images?"`
			`}]`
			`_assert_mm_data_is_image_input(mm_data, 2)`