[Model] Molmo vLLM Integration (#9016)

Co-authored-by: sanghol <sanghol@allenai.org>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
Reza Salehi 2024-10-14 07:56:24 -07:00 committed by GitHub
parent 16b24e7dcd
commit dfe43a2071
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 1319 additions and 3 deletions

View File

@@ -399,6 +399,12 @@ Text Generation
- :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc.
-
-
* - :code:`MolmoForCausalLM`
- Molmo
- Image
- :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc.
-
- ✅︎
* - :code:`NVLM_D_Model`
- NVLM-D 1.0
- Image\ :sup:`E+`

View File

@@ -300,6 +300,23 @@ def run_mllama(question: str, modality: str):
return llm, prompt, stop_token_ids
# Molmo
def run_molmo(question: str, modality: str):
    """Build the Molmo-7B-D model and return (llm, prompt, stop_token_ids).

    Matches the signature/contract of the sibling ``run_*`` helpers
    (e.g. ``run_mllama``, ``run_glm4v``): only the "image" modality is
    supported, the question is used verbatim as the prompt, and no extra
    stop tokens are needed.
    """
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

    llm = LLM(
        model=model_name,
        # Molmo ships custom modeling code on the Hub, so remote code
        # must be trusted for the weights/processor to load.
        trust_remote_code=True,
        dtype="bfloat16",
    )

    prompt = question
    stop_token_ids = None
    return llm, prompt, stop_token_ids
# GLM-4v
def run_glm4v(question: str, modality: str):
assert modality == "image"
@@ -331,6 +348,7 @@ model_example_map = {
"qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl,
"mllama": run_mllama,
"molmo": run_molmo,
"glm4v": run_glm4v,
}

View File

@@ -163,6 +163,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return "<|image|>"
if model_type == "qwen2_vl":
return "<|vision_start|><|image_pad|><|vision_end|>"
if model_type == "molmo":
return ""
raise TypeError(f"Unknown model type: {model_type}")
elif modality == "audio":

View File

@@ -20,4 +20,4 @@ __all__ = [
"supports_multimodal",
"SupportsPP",
"supports_pp",
]
]

File diff suppressed because it is too large Load Diff

View File

@@ -1167,8 +1167,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
continue
param = params_dict[name]
except KeyError:
print(params_dict.keys())
raise
raise ValueError(f"Unexpected weight: {name}") from None
weight_loader = getattr(param, "weight_loader",
default_weight_loader)

View File

@@ -104,6 +104,7 @@ _MULTIMODAL_MODELS = {
"LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501
"LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501
"MiniCPMV": ("minicpmv", "MiniCPMV"),
"MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
"NVLM_D": ("nvlm_d", "NVLM_D_Model"),
"PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),