[Frontend] Use a proper chat template for VLM2Vec (#9912)

Cyrus Leung 2024-11-01 22:09:07 +08:00 committed by GitHub
parent 30a2e80742
commit ba0d892074
6 changed files with 78 additions and 11 deletions

View File

@@ -240,8 +240,7 @@ To consume the server, you can use the OpenAI client like in the example below:
    )
    print("Chat completion output:", chat_response.choices[0].message.content)

-A full code example can be found in `examples/openai_api_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_api_client_for_multimodal.py>`_.
+A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py>`_.

.. tip::
    There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
@@ -269,14 +268,19 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.

.. code-block:: bash

    vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \
-      --trust-remote-code --max-model-len 4096
+      --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja

.. important::

    Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding``
    to run this model in embedding mode instead of text generation mode.

-Since this schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:
+.. important::
+
+    VLM2Vec does not expect chat-based input. We use a `custom chat template <https://github.com/vllm-project/vllm/blob/main/examples/template_vlm2vec.jinja>`_
+    to combine the text and images together.
+
+Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library:

.. code-block:: python
@@ -301,3 +305,5 @@ Since this schema is not defined by OpenAI client, we post a request to the serv
    response.raise_for_status()
    response_json = response.json()
    print("Embedding output:", response_json["data"][0]["embedding"])
+
+A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_embedding_client_for_multimodal.py>`_.

View File

@@ -0,0 +1,33 @@
import requests

image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

# Embed a single image (plus an instruction) via the OpenAI-compatible /v1/embeddings route.
response = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={
        "model": "TIGER-Lab/VLM2Vec-Full",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": "Represent the given image."},
            ],
        }],
        "encoding_format": "float",
    },
)
response.raise_for_status()

response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])

View File

@@ -0,0 +1,16 @@
{%- if messages | length > 1 -%}
    {{ raise_exception('Embedding models should only embed one message at a time') }}
{%- endif -%}

{% set vars = namespace(parts=[], next_image_id=1) %}
{%- for message in messages -%}
    {%- for content in message['content'] -%}
        {%- if content['type'] == 'text' -%}
            {%- set vars.parts = vars.parts + [content['text']] %}
        {%- elif content['type'] == 'image' -%}
            {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %}
            {%- set vars.next_image_id = vars.next_image_id + 1 %}
        {%- endif -%}
    {%- endfor -%}
{%- endfor -%}
{{ vars.parts | join(' ') }}
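
To see what this template produces, it can be rendered with plain Jinja2 outside the server. This is a sketch, not part of the commit; the {"type": "image"} placeholder matches the elif branch above, and it is an assumption here that the server hands image parts to the template in exactly that form.

import jinja2

# Load the template shipped in the repository (path assumes the repo root as CWD).
with open("examples/template_vlm2vec.jinja") as f:
    template = jinja2.Environment().from_string(f.read())

messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "Represent the given image."},
    ],
}]

# Each image becomes a numbered <|image_N|> placeholder and all parts are joined with spaces.
print(template.render(messages=messages))
# Expected output: <|image_1|> Represent the given image.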

View File

@@ -6,11 +6,14 @@ import requests

from vllm.multimodal.utils import encode_image_base64, fetch_image

-from ...utils import RemoteOpenAIServer
+from ...utils import VLLM_PATH, RemoteOpenAIServer

MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
MAXIMUM_IMAGES = 2

+vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
+assert vlm2vec_jinja_path.exists()
+
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
@@ -35,6 +38,8 @@ def server():
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        f"image={MAXIMUM_IMAGES}",
+        "--chat-template",
+        str(vlm2vec_jinja_path),
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -90,5 +95,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
    assert len(embeddings["data"]) == 1
    assert len(embeddings["data"][0]["embedding"]) == 3072
    assert embeddings["usage"]["completion_tokens"] == 0
-    assert embeddings["usage"]["prompt_tokens"] == 771
-    assert embeddings["usage"]["total_tokens"] == 771
+    assert embeddings["usage"]["prompt_tokens"] == 762
+    assert embeddings["usage"]["total_tokens"] == 762

View File

@@ -156,6 +156,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
        self._items: List[_T] = []

+    @property
+    def model_config(self) -> ModelConfig:
+        return self._model_config
+
    @staticmethod
    @lru_cache(maxsize=None)
    def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
@@ -491,10 +495,13 @@ def _parse_chat_message_content_parts(
    content: List[Union[str, Dict[str, str]]] = []

    mm_parser = mm_tracker.create_parser()
-    wrap_dicts = \
-        mm_tracker._model_config.hf_config.model_type in \
-            MODEL_KEEP_MULTI_MODAL_CONTENT or \
-        (chat_template_text_format == "openai")
+    model_config = mm_tracker.model_config
+
+    wrap_dicts = (chat_template_text_format == "openai"
+                  or (model_config.task == "embedding"
+                      and model_config.is_multimodal_model)
+                  or (model_config.hf_config.model_type
+                      in MODEL_KEEP_MULTI_MODAL_CONTENT))

    for part in parts:
        parse_res = _parse_chat_message_content_part(
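
For reference, the effect of the new condition can be sketched in isolation: a multimodal model served with --task embedding now takes the same dict-wrapping path as "openai"-format chat templates, so the VLM2Vec template receives a list of typed parts rather than a flattened string. The toy config below is illustrative only; the field names mirror the diff, but the model_type value and the contents of MODEL_KEEP_MULTI_MODAL_CONTENT are assumptions.

from dataclasses import dataclass

# Illustrative stand-ins only; not the vLLM definitions.
MODEL_KEEP_MULTI_MODAL_CONTENT = {"mllama"}

@dataclass
class ToyModelConfig:
    task: str
    is_multimodal_model: bool
    model_type: str

def should_wrap_dicts(cfg: ToyModelConfig, chat_template_text_format: str) -> bool:
    # Same boolean structure as the wrap_dicts expression introduced in this commit.
    return (chat_template_text_format == "openai"
            or (cfg.task == "embedding" and cfg.is_multimodal_model)
            or cfg.model_type in MODEL_KEEP_MULTI_MODAL_CONTENT)

# VLM2Vec (Phi-3.5-Vision architecture, model_type assumed "phi3_v") in embedding mode wraps parts as dicts:
print(should_wrap_dicts(ToyModelConfig("embedding", True, "phi3_v"), "string"))   # True
# The same model in generation mode with the default "string" format does not:
print(should_wrap_dicts(ToyModelConfig("generate", True, "phi3_v"), "string"))    # False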