[Model] support input embeddings for qwen2vl (#8856)
parent f13a07b1f8
commit e01ab595d8
@@ -281,7 +281,7 @@ Multimodal Language Models
     -
   * - :code:`Qwen2VLForConditionalGeneration`
     - Qwen2-VL
-    - Image\ :sup:`+` / Video\ :sup:`+`
+    - Image\ :sup:`E+` / Video\ :sup:`+`
     - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
     -
   * - :code:`UltravoxModel`
@@ -57,6 +57,23 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT
     "multi_modal_data": {"image": image_embeds},
 })
 
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+
+# Inference with image embeddings as input with additional parameters
+# Specifically, we are conducting a trial run of Qwen2VL with the new input format, as the model utilizes additional parameters for calculating positional encoding.
+image_embeds = torch.load(...) # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
+image_grid_thw = torch.load(...) # torch.Tensor of shape (1, 3)
+mm_data['image'] = {
+    "image_embeds": image_embeds,
+    "image_grid_thw": image_grid_thw,
+}
+outputs = llm.generate({
+    "prompt": prompt,
+    "multi_modal_data": mm_data,
+})
+
 for o in outputs:
     generated_text = o.outputs[0].text
     print(generated_text)
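For context on the shapes in the snippet above: Qwen2-VL merges vision patches spatially before they reach the language model, so the number of placeholder tokens one image contributes can be derived from its `image_grid_thw` row. The helper below is a hypothetical sketch, not part of this commit, and it assumes a spatial merge size of 2 (the usual Qwen2-VL setting); the function name is illustrative.

    import torch

    def expected_image_feature_size(image_grid_thw: torch.Tensor,
                                    spatial_merge_size: int = 2) -> int:
        # One row of image_grid_thw is (grid_t, grid_h, grid_w); the patch
        # merger reduces the token count by spatial_merge_size ** 2.
        t, h, w = image_grid_thw[0].tolist()
        return (t * h * w) // (spatial_merge_size**2)

    # e.g. a single-frame 16x16 patch grid -> 256 // 4 = 64 placeholder tokens,
    # so image_embeds should provide image_feature_size == 64 for that image.
    print(expected_image_feature_size(torch.tensor([[1, 16, 16]])))  # 64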
@@ -23,8 +23,8 @@
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
 from functools import lru_cache, partial
-from typing import (Iterable, List, Mapping, Optional, Tuple, Type, TypedDict,
-                    Union)
+from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional,
+                    Tuple, Type, TypedDict, Union)
 
 import torch
 import torch.nn as nn
@@ -76,19 +76,31 @@ logger = init_logger(__name__)
 # === Vision Inputs === #
 
 
-class Qwen2VLImageInputs(TypedDict):
-    pixel_values: torch.Tensor
+class Qwen2VLImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    data: torch.Tensor
     """Shape:
     `(num_patches, num_channels * patch_size * patch_size)`
     """
 
     image_grid_thw: torch.Tensor
     """Shape: `(num_images, 3)`
 
     This should be in `(grid_t, grid_h, grid_w)` format.
     """
 
 
+class Qwen2VLImageEmbeddingInputs(TypedDict):
+    type: Literal["image_embeds"]
+    data: torch.Tensor
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
+    `hidden_size` must match the hidden size of language model backbone.
+    """
+
+
+Qwen2VLImageInputs = Union[Qwen2VLImagePixelInputs,
+                           Qwen2VLImageEmbeddingInputs]
+
+
 class Qwen2VLVideoInputs(TypedDict):
     pixel_values_videos: torch.Tensor
     """Shape:
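The new `type` field turns the image input into a small tagged union, so downstream code can branch on a literal instead of probing tensor shapes. A minimal standalone sketch of that pattern (class names shortened, shapes assumed):

    import torch
    from typing import Literal, TypedDict, Union

    class PixelInputs(TypedDict):
        type: Literal["pixel_values"]
        data: torch.Tensor

    class EmbeddingInputs(TypedDict):
        type: Literal["image_embeds"]
        data: torch.Tensor

    ImageInputs = Union[PixelInputs, EmbeddingInputs]

    def describe(inputs: ImageInputs) -> str:
        # Dispatch on the literal tag rather than guessing from tensor rank.
        if inputs["type"] == "image_embeds":
            return f"precomputed embeddings {tuple(inputs['data'].shape)}"
        return f"raw patches {tuple(inputs['data'].shape)}"

    print(describe(EmbeddingInputs(type="image_embeds", data=torch.zeros(64, 8))))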
@@ -567,6 +579,11 @@ def mm_input_mapper_for_qwen2_vl(
     data_type_key: str,
 ) -> MultiModalInputs:
     """Input mapper for Qwen2-VL."""
+    if data_type_key == "image" and isinstance(data, dict):
+        return MultiModalInputs({
+            "image_embeds": data.get("image_embeds"),
+            "image_grid_thw": data.get("image_grid_thw"),
+        })
     model_config = ctx.model_config
     image_processor = cached_get_image_processor(
         model_config.model, trust_remote_code=model_config.trust_remote_code)
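The new early return means that when the user hands the mapper a dict, the tensors are forwarded untouched instead of being run through the HF image processor. A sketch of the multi_modal_data that takes this path (the 64 vision tokens and the 3584 hidden size are placeholder values, not mandated by the commit):

    import torch

    mm_data = {
        "image": {
            "image_embeds": torch.randn(64, 3584),          # (image_feature_size, hidden_size), assumed
            "image_grid_thw": torch.tensor([[1, 16, 16]]),  # (num_images, 3)
        }
    }
    # mm_input_mapper_for_qwen2_vl sees data_type_key == "image" and a dict,
    # so it wraps these two tensors in MultiModalInputs without calling the
    # image processor at all.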
@@ -739,6 +756,48 @@ def _get_llm_num_vision_tokens(
     return llm_num_vision_tokens
 
 
+def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable,
+                       data_type_key: str, image_processor: Any,
+                       prompt_token_ids: List[int]) -> List[int]:
+    """
+    Expand pad tokens for multi-modal inputs (e.g., images or videos).
+
+    Args:
+        inputs (list): The multi-modal inputs (e.g., images or videos).
+        token_id (int): The token ID used to represent the multi-modal input.
+        make_batched_fn (Callable): A function to batch the inputs.
+        data_type_key (str): The type of the multi-modal input.
+        image_processor (Any): The image processor used to process the inputs.
+        prompt_token_ids (List[int]): The list of token IDs in the prompt.
+
+    Returns:
+        List[int]: The list of token IDs for the multi-modal inputs.
+    """
+    indices = [
+        idx for idx, token in enumerate(prompt_token_ids) if token == token_id
+    ]
+    inputs = make_batched_fn(inputs)
+    assert len(indices) == len(inputs)
+
+    prompt_token_ids_with_data = []
+    for cnt, data in enumerate(inputs):
+        num_tokens = _get_llm_num_vision_tokens(
+            [data] if data_type_key == "image" else data,
+            data_type_key=data_type_key,
+            image_processor=image_processor,
+        )
+        if cnt == 0:
+            end_idx = indices[cnt]
+            non_data_tokens = prompt_token_ids[:end_idx]
+        else:
+            non_data_tokens = prompt_token_ids[indices[cnt - 1] +
+                                               1:indices[cnt]]
+        prompt_token_ids_with_data.extend(non_data_tokens)
+        prompt_token_ids_with_data.extend(token_id for _ in range(num_tokens))
+    prompt_token_ids_with_data.extend(prompt_token_ids[indices[-1] + 1:])
+    return prompt_token_ids_with_data
+
+
 def input_processor_for_qwen2_vl(ctx: InputContext,
                                  llm_inputs: LLMInputs) -> LLMInputs:
     multi_modal_data = llm_inputs.get("multi_modal_data", None)
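A toy, standalone illustration of what `_expand_pad_tokens` does to the prompt: every placeholder id is repeated once per vision token of the item it stands for. The helper below reimplements only the expansion step, with the per-item token counts supplied directly instead of computed by `_get_llm_num_vision_tokens`:

    from typing import List

    def expand(prompt_token_ids: List[int], token_id: int,
               tokens_per_item: List[int]) -> List[int]:
        out, item = [], 0
        for tok in prompt_token_ids:
            if tok == token_id:
                # Repeat the placeholder to reserve one slot per vision token.
                out.extend([token_id] * tokens_per_item[item])
                item += 1
            else:
                out.append(tok)
        return out

    # One image placeholder (id 99) that needs 4 vision tokens:
    assert expand([1, 99, 2], 99, [4]) == [1, 99, 99, 99, 99, 2]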
@@ -775,62 +834,38 @@ def input_processor_for_qwen2_vl(ctx: InputContext,
     )["input_ids"]
 
     # Expand image pad tokens.
 
     if image_inputs is not None:
-        image_indices = [
-            idx for idx, token in enumerate(prompt_token_ids)
-            if token == hf_config.image_token_id
-        ]
-        image_inputs = make_batched_images(image_inputs)
-        assert len(image_indices) == len(image_inputs)
-
-        prompt_token_ids_with_image = []
-        for image_cnt, image in enumerate(image_inputs):
-            num_image_tokens = _get_llm_num_vision_tokens(
-                [image],
-                data_type_key="image",
-                image_processor=image_processor,
-            )
-            if image_cnt == 0:
-                non_image_tokens = prompt_token_ids[:image_indices[image_cnt]]
-            else:
-                non_image_tokens = prompt_token_ids[image_indices[image_cnt -
-                                                                  1] +
-                                                    1:image_indices[image_cnt]]
-            prompt_token_ids_with_image.extend(non_image_tokens)
-            prompt_token_ids_with_image.extend(
-                hf_config.image_token_id for _ in range(num_image_tokens))
-        prompt_token_ids_with_image.extend(prompt_token_ids[image_indices[-1] +
-                                                            1:])
-        prompt_token_ids = prompt_token_ids_with_image
-
-    # Expand video pad tokens.
-    if video_inputs is not None:
-        video_indices = [
-            idx for idx, token in enumerate(prompt_token_ids)
-            if token == hf_config.video_token_id
-        ]
-        video_inputs = make_batched_videos(video_inputs)
-        assert len(video_indices) == len(video_inputs)
-
-        prompt_token_ids_with_video = []
-        for video_cnt, video in enumerate(video_inputs):
-            num_video_tokens = _get_llm_num_vision_tokens(
-                video,
-                data_type_key="video",
-                image_processor=image_processor,
-            )
-            if video_cnt == 0:
-                non_video_tokens = prompt_token_ids[:video_indices[video_cnt]]
-            else:
-                non_video_tokens = prompt_token_ids[video_indices[video_cnt -
-                                                                  1] +
-                                                    1:video_indices[video_cnt]]
-            prompt_token_ids_with_video.extend(non_video_tokens)
-            prompt_token_ids_with_video.extend(
-                hf_config.video_token_id for _ in range(num_video_tokens))
-        prompt_token_ids_with_video.extend(prompt_token_ids[video_indices[-1] +
-                                                            1:])
-        prompt_token_ids = prompt_token_ids_with_video
+        if isinstance(image_inputs, dict):
+            prompt_token_ids_with_image = []
+            image_indices = [
+                idx for idx, token in enumerate(prompt_token_ids)
+                if token == hf_config.image_token_id
+            ]
+            image_cnt = len(image_indices)
+            embed_dim = image_inputs.get('image_embeds').size(0)
+            assert embed_dim % image_cnt == 0
+            num_pad_tokens = embed_dim // image_cnt
+            for idx, token in enumerate(prompt_token_ids):
+                if idx in image_indices:
+                    prompt_token_ids_with_image.extend([token] *
+                                                       num_pad_tokens)
+                else:
+                    prompt_token_ids_with_image.append(token)
+            prompt_token_ids = prompt_token_ids_with_image
+        else:
+            prompt_token_ids = _expand_pad_tokens(image_inputs,
+                                                  hf_config.image_token_id,
+                                                  make_batched_images, "image",
+                                                  image_processor,
+                                                  prompt_token_ids)
+
+    if video_inputs is not None:
+        prompt_token_ids = _expand_pad_tokens(video_inputs,
+                                              hf_config.video_token_id,
+                                              make_batched_videos, "video",
+                                              image_processor,
+                                              prompt_token_ids)
 
     return LLMInputs(
         prompt_token_ids=prompt_token_ids,
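The embedding branch above sizes the expansion arithmetically: with precomputed embeddings there is no image processor to consult, so the per-image token count is simply the number of embedding rows divided by the number of image placeholders in the prompt. A small numeric sketch of that step:

    # Two image placeholders in the prompt and 128 embedding rows in total.
    embed_dim, image_cnt = 128, 2
    assert embed_dim % image_cnt == 0      # the code asserts an even split
    num_pad_tokens = embed_dim // image_cnt
    print(num_pad_tokens)                  # 64 placeholder tokens per image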
@@ -910,11 +945,13 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[Qwen2VLImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
+        image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
 
-        if pixel_values is None:
+        if pixel_values is None and image_embeds is None:
             return None
 
-        pixel_values = self._validate_and_reshape_mm_tensor(
-            pixel_values, "image pixel values")
-        image_grid_thw = self._validate_and_reshape_mm_tensor(
+        if pixel_values is not None:
+            pixel_values = self._validate_and_reshape_mm_tensor(
+                pixel_values, "image pixel values")
+            image_grid_thw = self._validate_and_reshape_mm_tensor(
@@ -924,9 +961,17 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
                 raise ValueError("Incorrect type of image pixel values. "
                                  f"Got type: {type(pixel_values)}")
 
-        return Qwen2VLImageInputs(pixel_values=pixel_values,
-                                  image_grid_thw=image_grid_thw)
+            return Qwen2VLImagePixelInputs(type="pixel_values",
+                                           data=pixel_values,
+                                           image_grid_thw=image_grid_thw)
+
+        if image_embeds is not None:
+            if not isinstance(image_embeds, torch.Tensor):
+                raise ValueError("Incorrect type of image embeddings. "
+                                 f"Got type: {type(image_embeds)}")
+            return Qwen2VLImageEmbeddingInputs(type="image_embeds",
+                                               data=image_embeds)
 
     def _parse_and_validate_video_input(
             self, **kwargs: object) -> Optional[Qwen2VLVideoInputs]:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
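How the updated validator behaves on the two kwarg flavors, condensed into a standalone sketch (the reshape and grid validation are omitted; the returned dicts mirror the TypedDicts introduced earlier):

    from typing import Optional
    import torch

    def parse_image_input(**kwargs) -> Optional[dict]:
        pixel_values = kwargs.pop("pixel_values", None)
        image_embeds = kwargs.pop("image_embeds", None)
        if pixel_values is None and image_embeds is None:
            return None  # no image in this request
        if pixel_values is not None:
            return {"type": "pixel_values", "data": pixel_values}
        if not isinstance(image_embeds, torch.Tensor):
            raise ValueError(f"Incorrect type: {type(image_embeds)}")
        return {"type": "image_embeds", "data": image_embeds}

    assert parse_image_input() is None
    assert parse_image_input(image_embeds=torch.zeros(64, 8))["type"] == "image_embeds"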
@@ -947,7 +992,10 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
 
     def _process_image_input(self,
                              image_input: Qwen2VLImageInputs) -> torch.Tensor:
-        pixel_values = image_input["pixel_values"].type(self.visual.dtype)
+        if image_input["type"] == "image_embeds":
+            return image_input["data"].type(self.visual.dtype)
+
+        pixel_values = image_input["data"].type(self.visual.dtype)
         image_embeds = self.visual(pixel_values,
                                    grid_thw=image_input["image_grid_thw"])
         return image_embeds
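This dispatch is the point where precomputed embeddings bypass the vision tower entirely: they are only cast to the tower's dtype, while raw pixel values still run through `self.visual`. A standalone sketch of that control flow (the `visual` callable and the dtype are stand-ins for illustration):

    import torch

    def process_image_input(image_input: dict, visual,
                            visual_dtype=torch.float16) -> torch.Tensor:
        if image_input["type"] == "image_embeds":
            # Embeddings are already in the LM's hidden space; skip the ViT.
            return image_input["data"].type(visual_dtype)
        pixel_values = image_input["data"].type(visual_dtype)
        return visual(pixel_values, grid_thw=image_input["image_grid_thw"])

    # With embeddings, `visual` is never called:
    out = process_image_input({"type": "image_embeds", "data": torch.zeros(64, 8)},
                              visual=None)
    print(out.dtype)  # torch.float16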