import warnings import pytest from PIL import Image from vllm.assets.image import ImageAsset from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import parse_chat_messages from vllm.multimodal.utils import encode_image_base64 from vllm.transformers_utils.tokenizer_group import TokenizerGroup PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" @pytest.fixture(scope="module") def phi3v_model_config(): return ModelConfig(PHI3V_MODEL_ID, PHI3V_MODEL_ID, tokenizer_mode="auto", trust_remote_code=True, dtype="bfloat16", seed=0, limit_mm_per_prompt={ "image": 2, }) @pytest.fixture(scope="module") def phi3v_tokenizer(): return TokenizerGroup( tokenizer_id=PHI3V_MODEL_ID, enable_lora=False, max_num_seqs=5, max_input_length=None, ) @pytest.fixture(scope="module") def image_url(): image = ImageAsset('cherry_blossom') base64 = encode_image_base64(image.pil_image) return f"data:image/jpeg;base64,{base64}" @pytest.mark.asyncio async def test_parse_chat_messages_with_image_url(phi3v_model_config, phi3v_tokenizer, image_url): conversation, mm_future = parse_chat_messages([{ "role": "user", "content": [{ "type": "image_url", "image_url": { "url": image_url } }, { "type": "text", "text": "What's in the image?" }] }], phi3v_model_config, phi3v_tokenizer) assert conversation == [{ "role": "user", "content": "<|image_1|>\nWhat's in the image?" }] mm_data = await mm_future assert set(mm_data.keys()) == {"image"} assert isinstance(mm_data["image"], Image.Image) @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images(phi3v_model_config, phi3v_tokenizer, image_url): conversation, mm_future = parse_chat_messages([{ "role": "user", "content": [{ "type": "image_url", "image_url": { "url": image_url } }, { "type": "image_url", "image_url": { "url": image_url } }, { "type": "text", "text": "What's in these images?" }] }], phi3v_model_config, phi3v_tokenizer) assert conversation == [{ "role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?" }] mm_data = await mm_future assert set(mm_data.keys()) == {"image"} assert len(mm_data["image"]) == 2 @pytest.mark.asyncio async def test_parse_chat_messages_placeholder_already_in_prompt( phi3v_model_config, phi3v_tokenizer, image_url): conversation, mm_future = parse_chat_messages([{ "role": "user", "content": [{ "type": "image_url", "image_url": { "url": image_url } }, { "type": "image_url", "image_url": { "url": image_url } }, { "type": "text", "text": "What's in <|image_1|> and how does it compare to <|image_2|>?" }] }], phi3v_model_config, phi3v_tokenizer) assert conversation == [{ "role": "user", "content": "What's in <|image_1|> and how does it compare to <|image_2|>?" }] mm_data = await mm_future assert set(mm_data.keys()) == {"image"} assert len(mm_data["image"]) == 2 @pytest.mark.asyncio async def test_parse_chat_messages_placeholder_one_already_in_prompt( phi3v_model_config, phi3v_tokenizer, image_url): conversation, mm_future = parse_chat_messages([{ "role": "user", "content": [{ "type": "image_url", "image_url": { "url": image_url } }, { "type": "image_url", "image_url": { "url": image_url } }, { "type": "text", "text": "What's in <|image_1|> and how does it compare to the other one?" }] }], phi3v_model_config, phi3v_tokenizer) assert conversation == [{ "role": "user", "content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the " "other one?" }] mm_data = await mm_future assert set(mm_data.keys()) == {"image"} assert len(mm_data["image"]) == 2 @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_across_messages( phi3v_model_config, phi3v_tokenizer, image_url): conversation, mm_future = parse_chat_messages([{ "role": "user", "content": [{ "type": "image_url", "image_url": { "url": image_url } }, { "type": "text", "text": "What's in this image?" }] }, { "role": "assistant", "content": "Some stuff." }, { "role": "user", "content": [{ "type": "image_url", "image_url": { "url": image_url } }, { "type": "text", "text": "What about this one?" }] }], phi3v_model_config, phi3v_tokenizer) assert conversation == [ { "role": "user", "content": "<|image_1|>\nWhat's in this image?" }, { "role": "assistant", "content": "Some stuff." }, { "role": "user", "content": "<|image_2|>\nWhat about this one?" }, ] mm_data = await mm_future assert set(mm_data.keys()) == {"image"} assert len(mm_data["image"]) == 2 @pytest.mark.asyncio async def test_parse_chat_messages_rejects_too_many_images_in_one_message( phi3v_model_config, phi3v_tokenizer, image_url): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message="coroutine 'async_get_and_parse_image' was never awaited") with pytest.raises( ValueError, match="At most 2 image\\(s\\) may be provided in one request\\." ): parse_chat_messages([{ "role": "user", "content": [{ "type": "image_url", "image_url": { "url": image_url } }, { "type": "image_url", "image_url": { "url": image_url } }, { "type": "image_url", "image_url": { "url": image_url } }, { "type": "text", "text": "What's in these images?" }] }], phi3v_model_config, phi3v_tokenizer) @pytest.mark.asyncio async def test_parse_chat_messages_rejects_too_many_images_across_messages( phi3v_model_config, phi3v_tokenizer, image_url): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message="coroutine 'async_get_and_parse_image' was never awaited") with pytest.raises( ValueError, match="At most 2 image\\(s\\) may be provided in one request\\." ): parse_chat_messages([{ "role": "user", "content": [{ "type": "image_url", "image_url": { "url": image_url } }, { "type": "text", "text": "What's in this image?" }] }, { "role": "assistant", "content": "Some stuff." }, { "role": "user", "content": [{ "type": "image_url", "image_url": { "url": image_url } }, { "type": "image_url", "image_url": { "url": image_url } }, { "type": "text", "text": "What about these two?" }] }], phi3v_model_config, phi3v_tokenizer)