""" This example shows how to use vLLM for running offline inference with the correct prompt format on vision language models for text generation. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ import random from transformers import AutoTokenizer from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.utils import FlexibleArgumentParser # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # lower-end GPUs. # Unless specified, these settings have been tested to work on a single L4. # Aria def run_aria(question: str, modality: str): assert modality == "image" model_name = "rhymes-ai/Aria" llm = LLM(model=model_name, tokenizer_mode="slow", trust_remote_code=True, dtype="bfloat16", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) prompt = (f"<|im_start|>user\n<|img|>\n{question}" "<|im_end|>\n<|im_start|>assistant\n") stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] return llm, prompt, stop_token_ids # BLIP-2 def run_blip2(question: str, modality: str): assert modality == "image" # BLIP-2 prompt format is inaccurate on HuggingFace model repository. # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa prompt = f"Question: {question} Answer:" llm = LLM(model="Salesforce/blip2-opt-2.7b", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None return llm, prompt, stop_token_ids # Chameleon def run_chameleon(question: str, modality: str): assert modality == "image" prompt = f"{question}" llm = LLM(model="facebook/chameleon-7b", max_model_len=4096, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None return llm, prompt, stop_token_ids # Fuyu def run_fuyu(question: str, modality: str): assert modality == "image" prompt = f"{question}\n" llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None return llm, prompt, stop_token_ids # GLM-4v def run_glm4v(question: str, modality: str): assert modality == "image" model_name = "THUDM/glm-4v-9b" llm = LLM(model=model_name, max_model_len=2048, max_num_seqs=2, trust_remote_code=True, enforce_eager=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) prompt = question stop_token_ids = [151329, 151336, 151338] return llm, prompt, stop_token_ids # H2OVL-Mississippi def run_h2ovl(question: str, modality: str): assert modality == "image" model_name = "h2oai/h2ovl-mississippi-2b" llm = LLM( model=model_name, trust_remote_code=True, max_model_len=8192, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) messages = [{'role': 'user', 'content': f"\n{question}"}] prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) # Stop tokens for H2OVL-Mississippi # https://huggingface.co/h2oai/h2ovl-mississippi-2b stop_token_ids = [tokenizer.eos_token_id] return llm, prompt, stop_token_ids # Idefics3-8B-Llama3 def run_idefics3(question: str, modality: str): assert modality == "image" model_name = "HuggingFaceM4/Idefics3-8B-Llama3" llm = LLM( model=model_name, max_model_len=8192, max_num_seqs=2, enforce_eager=True, # if you are running out of memory, you can reduce the "longest_edge". 
        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {
                "longest_edge": 3 * 364
            },
        },
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
    prompt = (
        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
    )
    stop_token_ids = None
    return llm, prompt, stop_token_ids


# InternVL
def run_internvl(question: str, modality: str):
    assert modality == "image"

    model_name = "OpenGVLab/InternVL2-2B"

    llm = LLM(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    # Stop tokens for InternVL
    # model variants may have different stop tokens;
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    return llm, prompt, stop_token_ids


# LLaVA-1.5
def run_llava(question: str, modality: str):
    assert modality == "image"

    prompt = f"USER: <image>\n{question}\nASSISTANT:"

    llm = LLM(model="llava-hf/llava-1.5-7b-hf",
              max_model_len=4096,
              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
    stop_token_ids = None
    return llm, prompt, stop_token_ids


# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(question: str, modality: str):
    assert modality == "image"

    prompt = f"[INST] <image>\n{question} [/INST]"
    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
              max_model_len=8192,
              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
    stop_token_ids = None
    return llm, prompt, stop_token_ids


# LLaVA-NeXT-Video
# Currently only supports video input
def run_llava_next_video(question: str, modality: str):
    assert modality == "video"

    prompt = f"USER: