[Bugfix] Fix validation error for text-only Mllama 3.2 (#16377)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 3d4c87758e
commit a5d11a54dc
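For context, the bug surfaces when a text-only request is sent to an Mllama (Llama 3.2 Vision) checkpoint: the encoder prompt ends up empty and the input validators rejected it with "The encoder prompt cannot be empty". A minimal reproduction sketch, not part of this commit; the model id, max_model_len, and prompt are illustrative:

from vllm import LLM

# Text-only request to a Llama 3.2 Vision (Mllama) model. Before this fix,
# the empty encoder prompt tripped the "prompt cannot be empty" validation.
llm = LLM(model="meta-llama/Llama-3.2-11B-Vision-Instruct", max_model_len=4096)
outputs = llm.generate("The capital of France is")
print(outputs[0].outputs[0].text)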
@@ -2046,27 +2046,31 @@ class LLMEngine:
         *,
         prompt_type: Literal["encoder", "decoder"],
     ):
-        if prompt_type == "encoder" and self.tokenizer is not None:
-            tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
-            model_config = self.model_config
+        model_config = self.model_config
+        tokenizer = (None if self.tokenizer is None else
+                     self.tokenizer.get_lora_tokenizer(lora_request))

-            if model_config.is_multimodal_model:
+        prompt_ids = prompt_inputs["prompt_token_ids"]
+        if not prompt_ids:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
+
+        max_prompt_len = self.model_config.max_model_len
+        if len(prompt_ids) >= max_prompt_len:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
                 mm_registry = self.input_preprocessor.mm_registry
                 mm_processor = mm_registry.create_processor(
-                    model_config, tokenizer=tokenizer)
+                    model_config,
+                    tokenizer=tokenizer or object(),  # Dummy if no tokenizer
+                )
                 assert isinstance(mm_processor, EncDecMultiModalProcessor)

                 if mm_processor.pad_dummy_encoder_prompt:
                     return  # Skip encoder length check for Whisper

-        prompt_ids = prompt_inputs["prompt_token_ids"]
-
-        if not prompt_ids:
-            raise ValueError(f"The {prompt_type} prompt cannot be empty")
-
-        max_prompt_len = self.model_config.max_model_len
-        if len(prompt_ids) >= max_prompt_len:
-            if self.model_config.is_multimodal_model:
+            if model_config.is_multimodal_model:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "
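The net effect of the hunk above: an empty token list is now tolerated only for the encoder prompt of a multimodal model (the Mllama text-only case), and the Whisper dummy-encoder-prompt early return is only consulted once the length check would otherwise fail. A standalone sketch of the resulting control flow, with plain arguments standing in for the engine's config and processor objects; the function name, parameters, and overflow message are illustrative, not vLLM's actual API:

def validate_prompt(prompt_ids: list[int],
                    *,
                    prompt_type: str,          # "encoder" or "decoder"
                    is_multimodal: bool,
                    max_model_len: int,
                    pads_dummy_encoder_prompt: bool = False) -> None:
    # Empty prompts are only legal on the encoder side of a multimodal
    # model, e.g. a text-only request to Mllama 3.2.
    if not prompt_ids:
        if prompt_type == "encoder" and is_multimodal:
            pass  # Mllama may have empty encoder inputs for text-only data
        else:
            raise ValueError(f"The {prompt_type} prompt cannot be empty")

    if len(prompt_ids) >= max_model_len:
        # Whisper-style processors pad a dummy encoder prompt, so the
        # encoder length check is skipped for them.
        if prompt_type == "encoder" and is_multimodal and pads_dummy_encoder_prompt:
            return
        raise ValueError(
            f"The {prompt_type} prompt (length {len(prompt_ids)}) is longer "
            f"than the maximum model length of {max_model_len}")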
@@ -211,6 +211,9 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
         # }

         if mm_data:
+            hf_processor = self.info.get_hf_processor()
+            image_token: str = hf_processor.image_token
+
             # Since only the last group of consecutive images
             # are attended by the decoded tokens, we only need to
             # get the number of tokens for those images.
@@ -227,7 +230,7 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
             num_tokens = decode_tiles * token_per_chunk
             mm_inputs["encoder_prompt_token_ids"] = [image_token_id
                                                      ] * num_tokens
-            mm_inputs["encoder_prompt"] = "<|image|>" * num_tokens
+            mm_inputs["encoder_prompt"] = image_token * num_tokens

         return mm_inputs
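In the Mllama processor, the encoder-prompt branch only runs when mm_data is present, so a text-only request leaves the encoder prompt empty (which the validators above now accept), and when images are present the encoder prompt string is built from the HF processor's image_token rather than a hard-coded "<|image|>" literal, keeping the string consistent with encoder_prompt_token_ids. A small illustration with placeholder values; the token id and tile counts are hypothetical:

# Placeholder values; the real ones come from the HF Mllama processor,
# its tokenizer, and the image tile computation.
image_token = "<|image|>"   # hf_processor.image_token
image_token_id = 128256     # hypothetical vocabulary id for image_token
decode_tiles, token_per_chunk = 4, 1025
num_tokens = decode_tiles * token_per_chunk

encoder_prompt_token_ids = [image_token_id] * num_tokens
encoder_prompt = image_token * num_tokens

# One image token string per encoder token id, so both views agree in length.
assert encoder_prompt.count(image_token) == len(encoder_prompt_token_ids)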
@@ -315,32 +315,34 @@ class Processor:
         *,
         prompt_type: Literal["encoder", "decoder"],
     ):
+        model_config = self.model_config
         tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)

-        if prompt_type == "encoder":
-            model_config = self.model_config
-
-            if model_config.is_multimodal_model:
-                mm_registry = self.input_preprocessor.mm_registry
-                mm_processor = mm_registry.create_processor(
-                    model_config, tokenizer=tokenizer)
-                assert isinstance(mm_processor, EncDecMultiModalProcessor)
-
-                if mm_processor.pad_dummy_encoder_prompt:
-                    return  # Skip encoder length check for Whisper
-
         prompt_ids = prompt_inputs["prompt_token_ids"]
-
         if not prompt_ids:
-            raise ValueError(f"The {prompt_type} prompt cannot be empty")
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")

-        max_input_id = max(prompt_ids)
+        max_input_id = max(prompt_ids, default=0)
         if max_input_id > tokenizer.max_token_id:
             raise ValueError(f"Token id {max_input_id} is out of vocabulary")

         max_prompt_len = self.model_config.max_model_len
         if len(prompt_ids) >= max_prompt_len:
-            if self.model_config.is_multimodal_model:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                mm_registry = self.input_preprocessor.mm_registry
+                mm_processor = mm_registry.create_processor(
+                    model_config,
+                    tokenizer=tokenizer,
+                )
+                assert isinstance(mm_processor, EncDecMultiModalProcessor)
+
+                if mm_processor.pad_dummy_encoder_prompt:
+                    return  # Skip encoder length check for Whisper
+
+            if model_config.is_multimodal_model:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "
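Because an empty encoder prompt can now reach the vocabulary check in the v1 Processor, max() needs a default: on an empty list it would otherwise raise a ValueError of its own and reintroduce the failure in a different form. A quick illustration:

prompt_ids: list[int] = []        # text-only Mllama request, empty encoder prompt

# max([]) raises ValueError; max([], default=0) returns the fallback instead.
max_input_id = max(prompt_ids, default=0)
assert max_input_id == 0          # 0 never exceeds tokenizer.max_token_id,
                                  # so the out-of-vocabulary check passes trivially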