[Bugfix] Fix validation error for text-only Mllama 3.2 (#16377)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2025-04-10 14:19:42 +08:00 committed by GitHub
parent 3d4c87758e
commit a5d11a54dc
3 changed files with 39 additions and 30 deletions
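
For context, a rough reproduction of the failure this commit addresses (a sketch only; the checkpoint name and engine arguments are illustrative and not taken from this commit): a text-only request to an Mllama 3.2 model yields an empty derived encoder prompt, which the old validation rejected with "The encoder prompt cannot be empty".

# Sketch of a text-only request to an Mllama (Llama 3.2 Vision) model.
# Before this fix, the empty derived encoder prompt failed validation.
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.2-11B-Vision-Instruct",  # illustrative checkpoint
          max_model_len=4096)

# No image is attached, so the encoder prompt has zero tokens.
outputs = llm.generate("What is the capital of France?",
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)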

@@ -2046,27 +2046,31 @@ class LLMEngine:
         *,
         prompt_type: Literal["encoder", "decoder"],
     ):
-        if prompt_type == "encoder" and self.tokenizer is not None:
-            tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
-            model_config = self.model_config
-            if model_config.is_multimodal_model:
+        model_config = self.model_config
+        tokenizer = (None if self.tokenizer is None else
+                     self.tokenizer.get_lora_tokenizer(lora_request))
+
+        prompt_ids = prompt_inputs["prompt_token_ids"]
+        if not prompt_ids:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
+
+        max_prompt_len = self.model_config.max_model_len
+        if len(prompt_ids) >= max_prompt_len:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
                 mm_registry = self.input_preprocessor.mm_registry
                 mm_processor = mm_registry.create_processor(
-                    model_config, tokenizer=tokenizer)
+                    model_config,
+                    tokenizer=tokenizer or object(),  # Dummy if no tokenizer
+                )
                 assert isinstance(mm_processor, EncDecMultiModalProcessor)
                 if mm_processor.pad_dummy_encoder_prompt:
                     return  # Skip encoder length check for Whisper
 
-        prompt_ids = prompt_inputs["prompt_token_ids"]
-        if not prompt_ids:
-            raise ValueError(f"The {prompt_type} prompt cannot be empty")
-
-        max_prompt_len = self.model_config.max_model_len
-        if len(prompt_ids) >= max_prompt_len:
-            if self.model_config.is_multimodal_model:
+            if model_config.is_multimodal_model:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "

@@ -211,6 +211,9 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
         # }
         if mm_data:
+            hf_processor = self.info.get_hf_processor()
+            image_token: str = hf_processor.image_token
+
             # Since only the last group of consecutive images
             # are attended by the decoded tokens, we only need to
             # get the number of tokens for those images.
@@ -227,7 +230,7 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
             num_tokens = decode_tiles * token_per_chunk
             mm_inputs["encoder_prompt_token_ids"] = [image_token_id
                                                      ] * num_tokens
-            mm_inputs["encoder_prompt"] = "<|image|>" * num_tokens
+            mm_inputs["encoder_prompt"] = image_token * num_tokens
 
         return mm_inputs
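
The processor change replaces the hard-coded "<|image|>" literal with the image token reported by the HF processor, so the synthesized encoder prompt string stays consistent with `encoder_prompt_token_ids` even if a checkpoint defines a different token. A minimal sketch, assuming transformers' Mllama processor exposes `image_token` (the checkpoint name and token count are illustrative):

# Sketch only: fetch the image token from the HF processor instead of
# hard-coding the literal string.
from transformers import AutoProcessor

hf_processor = AutoProcessor.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct")  # illustrative checkpoint
image_token: str = hf_processor.image_token      # "<|image|>" for current Mllama

num_tokens = 4  # placeholder; the real value is decode_tiles * token_per_chunk
encoder_prompt = image_token * num_tokens
print(encoder_prompt)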

@@ -315,32 +315,34 @@ class Processor:
         *,
         prompt_type: Literal["encoder", "decoder"],
     ):
+        model_config = self.model_config
         tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
-        if prompt_type == "encoder":
-            model_config = self.model_config
-            if model_config.is_multimodal_model:
-                mm_registry = self.input_preprocessor.mm_registry
-                mm_processor = mm_registry.create_processor(
-                    model_config, tokenizer=tokenizer)
-                assert isinstance(mm_processor, EncDecMultiModalProcessor)
-                if mm_processor.pad_dummy_encoder_prompt:
-                    return  # Skip encoder length check for Whisper
 
         prompt_ids = prompt_inputs["prompt_token_ids"]
         if not prompt_ids:
-            raise ValueError(f"The {prompt_type} prompt cannot be empty")
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
 
-        max_input_id = max(prompt_ids)
+        max_input_id = max(prompt_ids, default=0)
         if max_input_id > tokenizer.max_token_id:
             raise ValueError(f"Token id {max_input_id} is out of vocabulary")
 
         max_prompt_len = self.model_config.max_model_len
         if len(prompt_ids) >= max_prompt_len:
-            if self.model_config.is_multimodal_model:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                mm_registry = self.input_preprocessor.mm_registry
+                mm_processor = mm_registry.create_processor(
+                    model_config,
+                    tokenizer=tokenizer,
+                )
+                assert isinstance(mm_processor, EncDecMultiModalProcessor)
+                if mm_processor.pad_dummy_encoder_prompt:
+                    return  # Skip encoder length check for Whisper
+
+            if model_config.is_multimodal_model:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "