[Model] New model support for Phi-4-multimodal-instruct (#14119)
parent ade3f7d988
commit 0a995d5434
docs/source/models/supported_models.md

```diff
@@ -410,7 +410,7 @@ See [this page](#generative-models) for more information on how to use generative models.
   * ✅︎
 - * `Phi3ForCausalLM`
   * Phi-4, Phi-3
-  * `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
+  * `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
   * ✅︎
   * ✅︎
 - * `Phi3SmallForCausalLM`
```
```diff
@@ -856,6 +856,13 @@ See [this page](#generative-models) for more information on how to use generative models.
   *
   * ✅︎
   * ✅︎
+- * `Phi4MMForCausalLM`
+  * Phi-4-multimodal
+  * T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup>
+  * `microsoft/Phi-4-multimodal-instruct`, etc.
+  * ✅︎
+  *
+  *
 - * `PixtralForConditionalGeneration`
   * Pixtral
   * T + I<sup>+</sup>
```
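For context, a minimal offline-inference sketch of the new entry (not part of this diff): the model id and `trust_remote_code=True` mirror the registry changes below, and the `<|endoftext10|>` image placeholder comes from the chat_utils.py hunks later in this commit. The chat markup around the placeholder and the file name `example.jpg` are assumptions based on the Phi instruct format, not taken from this diff.

```python
from PIL import Image
from vllm import LLM

# Sketch only: model id and trust_remote_code come from this commit's
# registry entries; the prompt template is an assumption.
llm = LLM(model="microsoft/Phi-4-multimodal-instruct",
          trust_remote_code=True)

# <|endoftext10|> is the image placeholder token added in chat_utils.py below.
prompt = "<|user|><|endoftext10|>Describe this image.<|end|><|assistant|>"
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": Image.open("example.jpg")},
})
print(outputs[0].outputs[0].text)
```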
requirements-common.txt

```diff
@@ -37,3 +37,4 @@ depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/other/logging_configuration.md
+scipy # Required for phi-4-multimodal-instruct
```
tests/models/registry.py

```diff
@@ -272,6 +272,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                         extras={"v2": "google/paligemma2-3b-ft-docci-448"}),  # noqa: E501
     "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
                                         trust_remote_code=True),
+    "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
+                                         trust_remote_code=True),
     "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409",  # noqa: E501
                                                        tokenizer_mode="mistral"),
     "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",
```
vllm/config.py

```diff
@@ -2284,9 +2284,9 @@ class LoRAConfig:
         return hash_str
 
     def __post_init__(self):
-        # Setting the maximum rank to 256 should be able to satisfy the vast
+        # Setting the maximum rank to 512 should be able to satisfy the vast
         # majority of applications.
-        possible_max_ranks = (8, 16, 32, 64, 128, 256)
+        possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
         possible_lora_extra_vocab_size = (0, 256, 512)
        if self.max_lora_rank not in possible_max_ranks:
             raise ValueError(
```
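To make the widened check concrete, here is a standalone restatement of the validation above (not part of the diff). Ranks 320 and 512 are newly accepted, presumably to accommodate the vision and speech LoRA adapters bundled with Phi-4-multimodal-instruct.

```python
# Standalone restatement of the check in LoRAConfig.__post_init__ above;
# 320 and 512 are the newly accepted ranks.
POSSIBLE_MAX_RANKS = (8, 16, 32, 64, 128, 256, 320, 512)

def validate_max_lora_rank(max_lora_rank: int) -> None:
    if max_lora_rank not in POSSIBLE_MAX_RANKS:
        raise ValueError(f"max_lora_rank ({max_lora_rank}) must be one of "
                         f"{POSSIBLE_MAX_RANKS}.")

validate_max_lora_rank(320)  # passes after this change
validate_max_lora_rank(512)  # passes after this change
```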
vllm/entrypoints/chat_utils.py

```diff
@@ -395,6 +395,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
         if model_type == "phi3_v":
             # Workaround since this token is not defined in the tokenizer
             return f"<|image_{current_count}|>"
+        if model_type == "phi4mm":
+            return "<|endoftext10|>"  # 200010 (see vocab.json in hf model)
         if model_type in ("minicpmo", "minicpmv"):
             return "(<image>./</image>)"
         if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
```
```diff
@@ -424,6 +426,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
         elif modality == "audio":
             if model_type == "ultravox":
                 return "<|audio|>"
+            if model_type == "phi4mm":
+                return "<|endoftext11|>"  # 200011 (see vocab.json in hf model)
             if model_type == "qwen2_audio":
                 return (f"Audio {current_count}: "
                         f"<|audio_bos|><|AUDIO|><|audio_eos|>")
```
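Taken together, the two hunks above extend the per-model placeholder lookup for the `phi4mm` model type. A condensed sketch of that dispatch follows (the real logic lives in `BaseMultiModalItemTracker`; the function name here is invented for illustration):

```python
def phi4mm_placeholder(modality: str) -> str:
    """Condensed restatement of the two hunks above. Unlike phi3_v,
    the phi4mm placeholders do not encode the item index."""
    if modality == "image":
        return "<|endoftext10|>"  # token id 200010 in the HF vocab.json
    if modality == "audio":
        return "<|endoftext11|>"  # token id 200011 in the HF vocab.json
    raise ValueError(f"phi4mm has no placeholder for modality {modality!r}")

# A prompt with two images simply repeats the same image token:
prompt_prefix = phi4mm_placeholder("image") * 2
```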
vllm/model_executor/models/phi4mm.py (new file, 1803 lines): diff suppressed because it is too large.
vllm/model_executor/models/phi4mm_audio.py (new file, 1403 lines): diff suppressed because it is too large.
vllm/model_executor/models/phi4mm_utils.py (new file, 1969 lines): diff suppressed because it is too large.
vllm/model_executor/models/registry.py

```diff
@@ -182,6 +182,7 @@ _MULTIMODAL_MODELS = {
     "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),  # noqa: E501
     "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"),  # noqa: E501
     "UltravoxModel": ("ultravox", "UltravoxModel"),
+    "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
     # [Encoder-decoder]
     "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"),  # noqa: E501
     "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501
```
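For orientation, a simplified sketch of how an entry like the one above is resolved: the registry maps an HF architecture string to a `(module, class)` pair so the model module is only imported on demand. vLLM's actual registry wraps this in lazy-registration machinery; this is an illustration, not the real implementation.

```python
from importlib import import_module

# (module name, class name) pairs keyed by HF architecture string,
# as in _MULTIMODAL_MODELS above.
_MULTIMODAL_MODELS = {
    "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
}

def resolve_model_cls(architecture: str):
    # Import vllm.model_executor.models.<module> only when this
    # architecture is actually requested, then fetch the class.
    mod_name, cls_name = _MULTIMODAL_MODELS[architecture]
    module = import_module(f"vllm.model_executor.models.{mod_name}")
    return getattr(module, cls_name)
```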
vllm/model_executor/models/vision_siglip_navit.py (new file, 1966 lines): diff suppressed because it is too large.