vllm/examples/offline_inference_vision_language_multi_image.py

"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models, using the chat template defined
by the model.
"""
from argparse import Namespace
from typing import List

from vllm import LLM
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser

QUESTION = "What is the content of each image?"
IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
]
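

# Phi-3.5-vision needs trust_remote_code=True to load its custom processor,
# and limit_mm_per_prompt raises vLLM's per-prompt image cap (the default is
# a single image) to match the number of URLs we plan to send.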
def _load_phi3v(image_urls: List[str]):
    return LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
    )
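

# "generate" path: build the raw Phi-3.5-vision prompt by hand, with one
# 1-indexed <|image_N|> placeholder per image, and pass the fetched PIL
# images alongside the prompt via multi_modal_data.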
def run_phi3v_generate(question: str, image_urls: List[str]):
    llm = _load_phi3v(image_urls)

    placeholders = "\n".join(f"<|image_{i}|>"
                             for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {
            "image": [fetch_image(url) for url in image_urls]
        },
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
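

# "chat" path: pass an OpenAI-style message list instead and let llm.chat()
# apply the chat template defined by the model; vLLM resolves the image_url
# entries itself, so no manual placeholder bookkeeping is needed.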
def run_phi3v_chat(question: str, image_urls: List[str]):
    llm = _load_phi3v(image_urls)

    outputs = llm.chat([{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": question,
            },
            *({
                "type": "image_url",
                "image_url": {
                    "url": image_url
                },
            } for image_url in image_urls),
        ],
    }])

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


def main(args: Namespace):
    method = args.method

    if method == "generate":
        run_phi3v_generate(QUESTION, IMAGE_URLS)
    elif method == "chat":
        run_phi3v_chat(QUESTION, IMAGE_URLS)
    else:
        raise ValueError(f"Invalid method: {method}")
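

# FlexibleArgumentParser is vLLM's ArgumentParser subclass (it also accepts
# underscores and dashes interchangeably in flag names); the standard
# argparse.ArgumentParser would behave the same for this one flag.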
if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models that support multi-image input')
    parser.add_argument("--method",
                        type=str,
                        default="generate",
                        choices=["generate", "chat"],
                        help="The method to run in `vllm.LLM`.")
    args = parser.parse_args()
    main(args)