diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 3ac39887..54f7b8fb 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -2046,27 +2046,31 @@ class LLMEngine:
         *,
         prompt_type: Literal["encoder", "decoder"],
     ):
-        if prompt_type == "encoder" and self.tokenizer is not None:
-            tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
-            model_config = self.model_config
+        model_config = self.model_config
+        tokenizer = (None if self.tokenizer is None else
+                     self.tokenizer.get_lora_tokenizer(lora_request))
 
-            if model_config.is_multimodal_model:
+        prompt_ids = prompt_inputs["prompt_token_ids"]
+        if not prompt_ids:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
+
+        max_prompt_len = self.model_config.max_model_len
+        if len(prompt_ids) >= max_prompt_len:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
                 mm_registry = self.input_preprocessor.mm_registry
                 mm_processor = mm_registry.create_processor(
-                    model_config, tokenizer=tokenizer)
+                    model_config,
+                    tokenizer=tokenizer or object(),  # Dummy if no tokenizer
+                )
                 assert isinstance(mm_processor, EncDecMultiModalProcessor)
 
                 if mm_processor.pad_dummy_encoder_prompt:
                     return  # Skip encoder length check for Whisper
 
-        prompt_ids = prompt_inputs["prompt_token_ids"]
-
-        if not prompt_ids:
-            raise ValueError(f"The {prompt_type} prompt cannot be empty")
-
-        max_prompt_len = self.model_config.max_model_len
-        if len(prompt_ids) >= max_prompt_len:
-            if self.model_config.is_multimodal_model:
+            if model_config.is_multimodal_model:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "
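The reordered checks in `_validate_model_input` are the heart of the fix: the empty-prompt check now runs first and carves out multimodal encoder prompts, since Mllama emits an empty encoder prompt for text-only requests. Below is a minimal sketch of that control flow as a hypothetical standalone helper; the `is_multimodal` and `max_model_len` parameters stand in for the real `ModelConfig` fields, and the Whisper dummy-encoder-prompt early return is elided:

```python
from typing import Literal


def validate_model_input(
    prompt_token_ids: list[int],
    *,
    prompt_type: Literal["encoder", "decoder"],
    is_multimodal: bool,
    max_model_len: int,
) -> None:
    # Empty-prompt check runs first: Mllama produces an empty encoder
    # prompt for text-only requests, so that case is allowed through.
    if not prompt_token_ids:
        if prompt_type == "encoder" and is_multimodal:
            pass  # Valid: text-only input to a multimodal encoder-decoder
        else:
            raise ValueError(f"The {prompt_type} prompt cannot be empty")

    # Length check runs second, so an empty encoder prompt never trips it.
    if len(prompt_token_ids) >= max_model_len:
        raise ValueError(
            f"The {prompt_type} prompt is longer than {max_model_len} tokens")


# Empty Mllama encoder prompt passes validation.
validate_model_input([], prompt_type="encoder", is_multimodal=True,
                     max_model_len=4096)

# An empty decoder prompt still raises.
try:
    validate_model_input([], prompt_type="decoder", is_multimodal=True,
                         max_model_len=4096)
except ValueError as exc:
    print(exc)  # The decoder prompt cannot be empty
```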
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index a67339ca..d332b17f 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -211,6 +211,9 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
         # }
 
         if mm_data:
+            hf_processor = self.info.get_hf_processor()
+            image_token: str = hf_processor.image_token
+
             # Since only the last group of consecutive images
             # are attended by the decoded tokens, we only need to
             # get the number of tokens for those images.
@@ -227,7 +230,7 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
             num_tokens = decode_tiles * token_per_chunk
             mm_inputs["encoder_prompt_token_ids"] = [image_token_id
                                                      ] * num_tokens
-            mm_inputs["encoder_prompt"] = "<|image|>" * num_tokens
+            mm_inputs["encoder_prompt"] = image_token * num_tokens
 
         return mm_inputs
 
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 2525b10a..7d1913ec 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -315,32 +315,34 @@ class Processor:
         *,
         prompt_type: Literal["encoder", "decoder"],
     ):
+        model_config = self.model_config
         tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
 
-        if prompt_type == "encoder":
-            model_config = self.model_config
-
-            if model_config.is_multimodal_model:
-                mm_registry = self.input_preprocessor.mm_registry
-                mm_processor = mm_registry.create_processor(
-                    model_config, tokenizer=tokenizer)
-                assert isinstance(mm_processor, EncDecMultiModalProcessor)
-
-                if mm_processor.pad_dummy_encoder_prompt:
-                    return  # Skip encoder length check for Whisper
-
         prompt_ids = prompt_inputs["prompt_token_ids"]
-
         if not prompt_ids:
-            raise ValueError(f"The {prompt_type} prompt cannot be empty")
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
 
-        max_input_id = max(prompt_ids)
+        max_input_id = max(prompt_ids, default=0)
         if max_input_id > tokenizer.max_token_id:
             raise ValueError(f"Token id {max_input_id} is out of vocabulary")
 
         max_prompt_len = self.model_config.max_model_len
         if len(prompt_ids) >= max_prompt_len:
-            if self.model_config.is_multimodal_model:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                mm_registry = self.input_preprocessor.mm_registry
+                mm_processor = mm_registry.create_processor(
+                    model_config,
+                    tokenizer=tokenizer,
+                )
+                assert isinstance(mm_processor, EncDecMultiModalProcessor)
+
+                if mm_processor.pad_dummy_encoder_prompt:
+                    return  # Skip encoder length check for Whisper
+
+            if model_config.is_multimodal_model:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "
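One subtlety in the V1 `Processor` hunk: since `prompt_ids` may now legitimately be empty, the vocabulary check switches from `max(prompt_ids)` to `max(prompt_ids, default=0)`. A quick illustration of why the `default` matters:

```python
prompt_ids: list[int] = []  # e.g. Mllama encoder prompt for text-only data

# max(prompt_ids)  # would raise ValueError: max() arg is an empty sequence
max_input_id = max(prompt_ids, default=0)
assert max_input_id == 0  # 0 never exceeds tokenizer.max_token_id, so the
                          # out-of-vocabulary check passes for empty prompts
```

The Mllama hunks make a related consistency fix: instead of hard-coding the `"<|image|>"` string, the encoder prompt is built from the HF processor's own `image_token` attribute, so the prompt string always matches the `image_token_id` used for `encoder_prompt_token_ids`.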