# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow the corresponding examples
on the HuggingFace model repository.
"""
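# A minimal sketch of typical invocations, assuming this script is saved as
# `vision_language.py` (the filename is an assumption, not part of the
# original example); every flag used below is defined in the argument parser
# at the bottom of this file:
#
#   python vision_language.py --model-type llava --modality image
#   python vision_language.py --model-type qwen2_vl --modality video --num-frames 16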
import os
import random
from contextlib import contextmanager
from dataclasses import asdict
from typing import NamedTuple, Optional

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: list[str]
    stop_token_ids: Optional[list[int]] = None
    lora_requests: Optional[list[LoRARequest]] = None


# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.


# Aria
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "rhymes-ai/Aria"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
                "<|im_end|>\n<|im_start|>assistant\n")
               for question in questions]

    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "CohereForAI/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
        limit_mm_per_prompt={"image": 1},
    )
    prompts = [
        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        for question in questions
    ]
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
    prompts = [f"Question: {question} Answer:" for question in questions]
    engine_args = EngineArgs(
        model="Salesforce/blip2-opt-6.7b",
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Chameleon
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"{question}<image>" for question in questions]
    engine_args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Deepseek-VL2
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": 1},
    )

    prompts = [
        f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Florence2
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="facebook/bart-large",
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Fuyu
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"{question}\n" for question in questions]
    engine_args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Gemma 3
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={"image": 1},
    )

    prompts = [("<bos><start_of_turn>user\n"
                f"<start_of_image>{question}<end_of_turn>\n"
                "<start_of_turn>model\n") for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# GLM-4v
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "THUDM/glm-4v-9b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        limit_mm_per_prompt={"image": 1},
    )

    prompts = [
        f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
        f"{question}<|assistant|>" for question in questions
    ]

    stop_token_ids = [151329, 151336, 151338]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        # if you are running out of memory, you can reduce the "longest_edge".
        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {
                "longest_edge": 3 * 364
            },
        },
        limit_mm_per_prompt={"image": 1},
    )
    prompts = [(
        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
    ) for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs={
            "max_image_size": {
                "longest_edge": 384
            },
        },
        limit_mm_per_prompt={"image": 1},
    )
    prompts = [
        f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for InternVL
    # Model variants may have different stop tokens;
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# LLaVA-1.5
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [
        f"USER: <image>\n{question}\nASSISTANT:" for question in questions
    ]

    engine_args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
    engine_args = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# LLaVA-NeXT-Video
# Currently only supports video input
def run_llava_next_video(questions: list[str],
                         modality: str) -> ModelRequestData:
    assert modality == "video"

    prompts = [
        f"USER: <video>\n{question} ASSISTANT:" for question in questions
    ]
    engine_args = EngineArgs(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"video": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# LLaVA-OneVision
def run_llava_onevision(questions: list[str],
                        modality: str) -> ModelRequestData:

    if modality == "video":
        prompts = [
            f"<|im_start|>user <video>\n{question}<|im_end|> "
            "<|im_start|>assistant\n" for question in questions
        ]

    elif modality == "image":
        prompts = [
            f"<|im_start|>user <image>\n{question}<|im_end|> "
            "<|im_start|>assistant\n" for question in questions
        ]

    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Mantis
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501
    prompts = [
        llama3_template.format(f"{question}\n<image>")
        for question in questions
    ]

    engine_args = EngineArgs(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )
    stop_token_ids = [128009]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# MiniCPM-V
def run_minicpmv_base(questions: list[str], modality: str, model_name):
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa

    # 2.0
    # The official repo doesn't work yet, so we need to use a fork for now
    # For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
    # model_name = "HwwwH/MiniCPM-V-2"

    # 2.5
    # model_name = "openbmb/MiniCPM-Llama3-V-2_5"

    # 2.6
    # model_name = "openbmb/MiniCPM-V-2_6"
    # o2.6
    # model_name = "openbmb/MiniCPM-o-2_6"

    # modality supports
    # 2.0: image
    # 2.5: image
    # 2.6: image, video
    # o2.6: image, video, audio
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )
    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    # 2.0
    # stop_token_ids = [tokenizer.eos_id]

    # 2.5
    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]

    # 2.6 / o2.6
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    modality_placeholder = {
        "image": "(<image>./</image>)",
        "video": "(<video>./</video>)",
    }

    prompts = [
        tokenizer.apply_chat_template(
            [{
                'role': 'user',
                'content': f"{modality_placeholder[modality]}\n{question}"
            }],
            tokenize=False,
            add_generation_prompt=True) for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
    return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6")


def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
    return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")


# Mistral-3 HF-format
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": 1},
    )

    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Llama 3.2
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # Note: The default setting of max_num_seqs (256) and
    # max_model_len (131072) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    messages = [[{
        "role": "user",
        "content": [{
            "type": "image"
        }, {
            "type": "text",
            "text": question
        }]
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            add_generation_prompt=True,
                                            tokenize=False)

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        gpu_memory_utilization=0.4,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    messages = [[{
        "role": "user",
        "content": [{
            "type": "image"
        }, {
            "type": "text",
            "text": f"{question}"
        }]
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            add_generation_prompt=True,
                                            tokenize=False)
    stop_token_ids = None
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# Molmo
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": 1},
    )

    prompts = [
        f"<|im_start|>user <image>\n{question}<|im_end|> "
        "<|im_start|>assistant\n" for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# NVLM-D
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# PaliGemma
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # PaliGemma has special prompt format for VQA
    prompts = ["caption en" for _ in questions]
    engine_args = EngineArgs(
        model="google/paligemma-3b-mix-224",
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# PaliGemma 2
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # PaliGemma 2 has special prompt format for VQA
    prompts = ["caption en" for _ in questions]
    engine_args = EngineArgs(
        model="google/paligemma2-3b-ft-docci-448",
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Phi-3-Vision
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [
        f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"
        for question in questions
    ]

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"num_crops": 16},
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Phi-4-multimodal-instruct
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process image inputs.
    """
    assert modality == "image"
    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
    prompts = [
        f"<|user|><|image_1|>{question}<|end|><|assistant|>"
        for question in questions
    ]
    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
    )


# Pixtral HF-format
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "mistral-community/pixtral-12b"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=6144,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
    )

    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Qwen
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    engine_args = EngineArgs(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": 1},
    )

    prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Qwen2-VL
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Qwen2.5-VL
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"

    prompts = [
        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
         f"{question}<|im_end|>\n"
         "<|im_start|>assistant\n") for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "Skywork/Skywork-R1V-38B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for SkyworkR1V
    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
    stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
    "florence2": run_florence2,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
    "glm4v": run_glm4v,
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
    "mistral3": run_mistral3,
    "mllama": run_mllama,
    "llama4": run_llama4,
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
    "phi4_mm": run_phi4mm,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
}
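# To try a model that is not listed above, the pattern is to write a
# `run_<name>` helper that returns a `ModelRequestData` and register it in the
# map. A minimal, hypothetical sketch (the model name and prompt format are
# placeholders, not a tested configuration):
#
#   def run_my_model(questions: list[str], modality: str) -> ModelRequestData:
#       assert modality == "image"
#       engine_args = EngineArgs(model="my-org/my-vlm",
#                                max_model_len=4096,
#                                limit_mm_per_prompt={"image": 1})
#       prompts = [f"USER: <image>\n{q}\nASSISTANT:" for q in questions]
#       return ModelRequestData(engine_args=engine_args, prompts=prompts)
#
#   model_example_map["my_model"] = run_my_model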


def get_multi_modal_input(args):
    """
    return {
        "data": image or video,
        "questions": questions,
    }
    """
    if args.modality == "image":
        # Input image and question
        image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
        img_questions = [
            "What is the content of this image?",
            "Describe the content of this image in detail.",
            "What's in the image?",
            "Where is this image taken?",
        ]

        return {
            "data": image,
            "questions": img_questions,
        }

    if args.modality == "video":
        # Input video and question
        video = VideoAsset(name="sample_demo_1.mp4",
                           num_frames=args.num_frames).np_ndarrays
        vid_questions = ["Why is this video funny?"]

        return {
            "data": video,
            "questions": vid_questions,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)


def apply_image_repeat(image_repeat_prob, num_prompts, data,
                       prompts: list[str], modality):
    """Repeats images with provided probability of "image_repeat_prob".
    Used to simulate hit/miss for the MM preprocessor cache.
    """
    assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0)
    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    cur_image = data
    for i in range(num_prompts):
        if image_repeat_prob is not None:
            res = random.choices(no_yes, probs)[0]
            if res == 0:
                # No repeat => Modify one pixel
                cur_image = cur_image.copy()
                new_val = (i // 256 // 256, i // 256, i % 256)
                cur_image.putpixel((0, 0), new_val)

        inputs.append({
            "prompt": prompts[i % len(prompts)],
            "multi_modal_data": {
                modality: cur_image
            }
        })

    return inputs


@contextmanager
def time_counter(enable: bool):
    if enable:
        import time
        start_time = time.time()
        yield
        elapsed_time = time.time() - start_time
        print("-" * 50)
        print("-- generate time = {}".format(elapsed_time))
        print("-" * 50)
    else:
        yield


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    questions = mm_input["questions"]

    req_data = model_example_map[model](questions, modality)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {})

    engine_args = asdict(req_data.engine_args) | {
        "seed": args.seed,
        "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
    }
    llm = LLM(**engine_args)

    # To maintain code compatibility in this script, we add LoRA here.
    # You can also add LoRA using:
    # llm.generate(prompts, lora_request=lora_request,...)
    if req_data.lora_requests:
        for lora_request in req_data.lora_requests:
            llm.llm_engine.add_lora(lora_request=lora_request)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    prompts = req_data.prompts if args.use_different_prompt_per_request else [
        req_data.prompts[0]
    ]

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)

    assert args.num_prompts > 0
    if args.num_prompts == 1:
        # Single inference
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {
                modality: data
            },
        }
    else:
        # Batch inference
        if args.image_repeat_prob is not None:
            # Repeat images with specified probability of "image_repeat_prob"
            inputs = apply_image_repeat(args.image_repeat_prob,
                                        args.num_prompts, data, prompts,
                                        modality)
        else:
            # Use the same image for all prompts
            inputs = [{
                "prompt": prompts[i % len(prompts)],
                "multi_modal_data": {
                    modality: data
                },
            } for i in range(args.num_prompts)]

    # Add LoRA request if applicable
    lora_request = (req_data.lora_requests *
                    args.num_prompts if req_data.lora_requests else None)

    with time_counter(args.time_generate):
        outputs = llm.generate(
            inputs,
            sampling_params=sampling_params,
            lora_request=lora_request,
        )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for text generation')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
                        default="llava",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument('--num-prompts',
                        type=int,
                        default=4,
                        help='Number of prompts to run.')
    parser.add_argument('--modality',
                        type=str,
                        default="image",
                        choices=['image', 'video'],
                        help='Modality of the input.')
    parser.add_argument('--num-frames',
                        type=int,
                        default=16,
                        help='Number of frames to extract from the video.')
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")

    parser.add_argument(
        '--image-repeat-prob',
        type=float,
        default=None,
        help='Simulates the hit-ratio for multi-modal preprocessor cache'
        ' (if enabled)')

    parser.add_argument(
        '--disable-mm-preprocessor-cache',
        action='store_true',
        help='If True, disables caching of multi-modal preprocessor/mapper.')

    parser.add_argument(
        '--time-generate',
        action='store_true',
        help='If True, then print the total generate() call time')

    parser.add_argument(
        '--use-different-prompt-per-request',
        action='store_true',
        help='If True, then use different prompt (with the same multi-modal '
        'data) for each request.')

    args = parser.parse_args()
    main(args)