# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow the corresponding examples
on the HuggingFace model repository.
"""
import os
import random
from dataclasses import asdict
from typing import NamedTuple, Optional

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: list[str]
    stop_token_ids: Optional[list[int]] = None
    lora_requests: Optional[list[LoRARequest]] = None


# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
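# The helper below is not part of the per-model functions that follow; it is a
# minimal sketch (assuming a single-image "image" modality) of how one of the
# ModelRequestData entries returned by the run_* functions can be consumed:
# build an LLM from the EngineArgs, wire the stop tokens into SamplingParams,
# and pass an image alongside each prompt. The name `demo_generate` and the
# choice of ImageAsset("cherry_blossom") are illustrative assumptions.
def demo_generate(req_data: ModelRequestData) -> None:
    # Construct the engine from the per-model EngineArgs.
    llm = LLM(**asdict(req_data.engine_args))

    # Use a bundled sample image as the multi-modal input (assumed modality).
    image = ImageAsset("cherry_blossom").pil_image

    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)

    outputs = llm.generate(
        [{
            "prompt": prompt,
            "multi_modal_data": {
                "image": image
            },
        } for prompt in req_data.prompts],
        sampling_params=sampling_params,
    )
    for o in outputs:
        print(o.outputs[0].text)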
# Aria
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "rhymes-ai/Aria"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
                "<|im_end|>\n<|im_start|>assistant\n")
               for question in questions]

    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on the HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262  # noqa
    prompts = [f"Question: {question} Answer:" for question in questions]
    engine_args = EngineArgs(
        model="Salesforce/blip2-opt-2.7b",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Chameleon
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"{question}<image>" for question in questions]
    engine_args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Deepseek-VL2
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
    )

    prompts = [
        f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Florence2
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="facebook/bart-large",
        max_num_seqs=8,
        trust_remote_code=True,
        dtype="bfloat16",
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Fuyu
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"{question}\n" for question in questions]
    engine_args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Gemma 3
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [("<bos><start_of_turn>user\n"
                f"<start_of_image>{question}<end_of_turn>\n"
                "<start_of_turn>model\n") for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# GLM-4v
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
    model_name = "THUDM/glm-4v-9b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    prompts = [
        f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
{question}<|assistant|>" for question in questions
    ]

    stop_token_ids = [151329, 151336, 151338]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        # If you are running out of memory, you can reduce the "longest_edge".
        # See: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {
                "longest_edge": 3 * 364
            },
        },
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    prompts = [(
        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
    ) for question in questions]
    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    messages = [[{
        'role': 'user',
        'content': f"<image>\n{question}"
    }] for question in questions]
    prompts = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)

    # Stop tokens for InternVL
    # Model variants may have different stop tokens;
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# LLaVA-1.5
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [
        f"USER: <image>\n{question}\nASSISTANT:" for question in questions
    ]

    engine_args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
    engine_args = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# LLaVA-NeXT-Video
# Currently only supports video input
def run_llava_next_video(questions: list[str],
                         modality: str) -> ModelRequestData:
    assert modality == "video"

    prompts = [f"USER: