from vllm import LLM
from vllm.assets.image import ImageAsset

# Load a sample image and convert it to RGB.
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")

# VLM2Vec expects the image placeholder followed by an instruction.
prompt = "<|image_1|> Represent the given image with the following question: What is in the image"  # noqa: E501

# Create an LLM.
llm = LLM(
    model="TIGER-Lab/VLM2Vec-Full",
    task="embedding",
    trust_remote_code=True,
    max_model_len=4096,
    max_num_seqs=2,
    # Override the HF processor's num_crops setting for image tiling.
    mm_processor_kwargs={"num_crops": 16},
)

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})

# Print the outputs.
for output in outputs:
    print(output.outputs.embedding)  # list of 3072 floats
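
# A follow-on sketch (not part of the original example): VLM2Vec embeds text
# and images into a shared space, so a text-only prompt can be encoded the
# same way and compared against the image embedding. The query string below
# is illustrative, not a documented template.
import math

text_outputs = llm.encode("Represent the given sentence: cherry blossoms in bloom")
text_embedding = text_outputs[0].outputs.embedding
image_embedding = outputs[0].outputs.embedding

# Cosine similarity between the image and text embeddings.
dot = sum(a * b for a, b in zip(image_embedding, text_embedding))
norm = math.sqrt(sum(a * a for a in image_embedding))
norm *= math.sqrt(sum(b * b for b in text_embedding))
print(dot / norm)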