from functools import partial
from typing import List, Optional, Tuple, Type

import pytest
from PIL import Image

from vllm.inputs.data import ExplicitEncoderDecoderPrompt
from vllm.sequence import SampleLogprobs

from ....conftest import HfRunner, VllmRunner
from ...utils import check_logprobs_close

Florence2Prompt = partial(ExplicitEncoderDecoderPrompt,
                          decoder_prompt=None,
                          mm_processor_kwargs=None)

MODELS = ["microsoft/Florence-2-base"]
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
TOKENIZER = "facebook/bart-base"
# Florence-2 task prompts (see the model card); each encoder prompt selects
# the vision task to run.
PROMPTS = [
    Florence2Prompt(encoder_prompt="<CAPTION>"),
    Florence2Prompt(encoder_prompt="<DETAILED_CAPTION>"),
    Florence2Prompt(encoder_prompt="<MORE_DETAILED_CAPTION>"),
    Florence2Prompt(encoder_prompt="<CAPTION_TO_PHRASE_GROUNDING>"),
    Florence2Prompt(encoder_prompt="<OD>"),
    Florence2Prompt(encoder_prompt="<DENSE_REGION_CAPTION>"),
    Florence2Prompt(encoder_prompt="<REGION_PROPOSAL>"),
    Florence2Prompt(encoder_prompt="<OCR>"),
    Florence2Prompt(encoder_prompt="<OCR_WITH_REGION>"),
]


def vllm_to_hf_output(
        vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]]):
    """Sanitize vllm output to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

    # The HF BART-based decoder emits "</s><s>" before and "</s>" after the
    # generated text, so wrap the vLLM output the same way before comparing.
    hf_output_str = "</s><s>" + output_str + "</s>"

    return output_ids, hf_output_str, out_logprobs


def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    prompts: List[ExplicitEncoderDecoderPrompt],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    with vllm_runner(model,
                     tokenizer_name=TOKENIZER,
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            prompts, max_tokens, num_logprobs)

    # Florence-2 processors require image inputs, so feed a tiny dummy image
    # alongside each text-only prompt.
    dummy_image = Image.new(mode="RGB", size=(2, 2))
    with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model:
        # The HF wrapper expects get_output_embeddings() on the top-level
        # model; point it at the inner language model's LM head.
        hf_model.model.get_output_embeddings = lambda: \
            hf_model.model.language_model.lm_head
        hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            prompts,
            max_tokens,
            num_logprobs,
            images=[dummy_image] * len(prompts),
        ))

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=[
            vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs
        ],
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, model, dtype, max_tokens,
                num_logprobs) -> None:
    run_test(
        hf_runner,
        vllm_runner,
        PROMPTS,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )