"""
|
||
|
This example shows how to use vLLM for running offline inference with
|
||
|
multi-image input on vision language models, using the chat template defined
|
||
|
by the model.
|
||
|
"""
|
||
|
from argparse import Namespace
|
||
|
from typing import List
|
||
|
|
||
|
from vllm import LLM
|
||
|
from vllm.multimodal.utils import fetch_image
|
||
|
from vllm.utils import FlexibleArgumentParser
|
||
|
|
||
|
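# The same question is asked about two Wikimedia Commons photos (a mallard
# duck and a lion); each URL is fetched into a PIL image before inference.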
QUESTION = "What is the content of each image?"
|
||
|
IMAGE_URLS = [
|
||
|
"https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
|
||
|
"https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
|
||
|
]
|
||
|
|
||
|
|
||
|
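# Phi-3.5-vision ships custom processing code on the Hugging Face Hub, hence
# trust_remote_code=True. limit_mm_per_prompt lifts the default limit of one
# image per prompt so a single prompt can carry every input image, and
# max_model_len=4096 keeps the context (and KV cache) demo-sized.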
def _load_phi3v(image_urls: List[str]):
    return LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
    )


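# The `generate` path builds the prompt by hand: Phi-3.5-vision expects
# numbered <|image_1|>, <|image_2|>, ... placeholders, one per input image,
# with the images themselves passed alongside via `multi_modal_data`.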
def run_phi3v_generate(question: str, image_urls: List[str]):
    llm = _load_phi3v(image_urls)

    placeholders = "\n".join(f"<|image_{i}|>"
                             for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {
            "image": [fetch_image(url) for url in image_urls]
        },
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


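# The `chat` path passes OpenAI-style messages instead: `llm.chat` applies
# the chat template defined by the model and resolves each `image_url` entry
# itself, so no manual placeholders or fetch_image calls are needed.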
def run_phi3v_chat(question: str, image_urls: List[str]):
    llm = _load_phi3v(image_urls)

    outputs = llm.chat([{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": question,
            },
            *({
                "type": "image_url",
                "image_url": {
                    "url": image_url
                },
            } for image_url in image_urls),
        ],
    }])

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


def main(args: Namespace):
    method = args.method

    if method == "generate":
        run_phi3v_generate(QUESTION, IMAGE_URLS)
    elif method == "chat":
        run_phi3v_chat(QUESTION, IMAGE_URLS)
    else:
        raise ValueError(f"Invalid method: {method}")


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models that support multi-image input')
    parser.add_argument("--method",
                        type=str,
                        default="generate",
                        choices=["generate", "chat"],
                        help="The method to run in `vllm.LLM`.")

    args = parser.parse_args()
    main(args)
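
# Example invocations, assuming this script is saved as
# offline_inference_vision_language_multi_image.py (the filename here is
# illustrative):
#
#   python offline_inference_vision_language_multi_image.py --method generate
#   python offline_inference_vision_language_multi_image.py --method chat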