# SPDX-License-Identifier: Apache-2.0
"""
Demonstrate prompting of text-to-text
encoder/decoder models, specifically Florence-2
"""

# TODO(Isotr0py):
# Move to offline_inference/vision_language.py
# after porting vision backbone
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

# Engine configuration for a Florence-2 encoder/decoder model.
# NOTE(review): the BART tokenizer is supplied explicitly — presumably the
# Florence-2 checkpoint does not ship a standalone tokenizer; confirm.
engine_args = {
    "model": "microsoft/Florence-2-large",
    "tokenizer": "facebook/bart-large",
    "max_num_seqs": 8,
    "trust_remote_code": True,
}

# Create a Florence-2 encoder/decoder model instance.
llm = LLM(**engine_args)
# Load the demo images up front so the prompt structures below stay compact.
stop_sign_image = ImageAsset("stop_sign").pil_image
cherry_blossom_image = ImageAsset("cherry_blossom").pil_image

# Implicit prompt: a single Florence-2 task token serves as the whole prompt.
implicit_prompt = {
    "prompt": "<DETAILED_CAPTION>",
    "multi_modal_data": {
        "image": stop_sign_image,
    },
}

# Explicit encoder/decoder prompt: the encoder receives the instruction text
# plus the image, while the decoder starts from an empty prompt.
explicit_prompt = {
    "encoder_prompt": {
        "prompt": "Describe in detail what is shown in the image.",
        "multi_modal_data": {
            "image": cherry_blossom_image,
        },
    },
    "decoder_prompt": "",
}

prompts = [implicit_prompt, explicit_prompt]
|
|
|
|
# Create a sampling params object.
# temperature=0 makes decoding greedy (deterministic); top_p=1.0 keeps the
# full nucleus, and each request emits between 0 and 128 new tokens.
decoding_options = {
    "temperature": 0,
    "top_p": 1.0,
    "min_tokens": 0,
    "max_tokens": 128,
}
sampling_params = SamplingParams(**decoding_options)
|
|
|
|
# Run generation over all prompts. The result is a list of RequestOutput
# objects, one per prompt, each carrying the prompt, the generated text,
# and other request-level information.
outputs = llm.generate(prompts, sampling_params)

# Report the generated text for every request.
for request_output in outputs:
    completion = request_output.outputs[0].text
    print(f"Generated text: {completion!r}")