[Model] Add support for deepseek-vl2-tiny model (#12068)
Signed-off-by: Isotr0py <2037008807@qq.com>
parent 5fd24ec02e
commit 62b06ba23d
@@ -618,7 +618,7 @@ See [this page](#generative-models) for more information on how to use generativ
 * - `DeepseekVLV2ForCausalLM`
   - DeepSeek-VL2
   - T + I<sup>+</sup>
-  - `deepseek-ai/deepseek-vl2-tiny`(WIP), `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
+  - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
   -
   - ✅︎
   - ✅︎
@@ -768,9 +768,8 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.

 ````{note}
-The `deepseek-ai/deepseek-vl2-tiny` is not supported yet.

 To use `DeepSeek-VL2` series models, you need to install a fork version `deepseek_vl2` package:

 ```shell
 pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git
 ```
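As a quick follow-up to the install step above, a minimal sanity check (not part of the documented instructions; it only assumes the module is named `deepseek_vl2`, matching the pip command):

```python
# Illustrative check that the forked package installed correctly.
import deepseek_vl2

print(deepseek_vl2.__file__)
```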
@@ -70,7 +70,7 @@ def run_chameleon(question: str, modality: str):
 def run_deepseek_vl2(question: str, modality: str):
     assert modality == "image"

-    model_name = "deepseek-ai/deepseek-vl2-small"
+    model_name = "deepseek-ai/deepseek-vl2-tiny"

     llm = LLM(model=model_name,
               max_model_len=4096,
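The hunk ends mid-call; for context, here is a sketch of how the full helper plausibly reads, following the pattern of the other `run_*` helpers in this example file. The `max_num_seqs`, `hf_overrides`, and returned `stop_token_ids` are assumptions (the `hf_overrides` value mirrors the tests' `vllm_runner_kwargs` further down), not lines from this commit:

```python
from vllm import LLM


def run_deepseek_vl2(question: str, modality: str):
    assert modality == "image"

    model_name = "deepseek-ai/deepseek-vl2-tiny"

    # Engine arguments beyond max_model_len are assumptions for illustration.
    llm = LLM(model=model_name,
              max_model_len=4096,
              max_num_seqs=2,
              hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})

    # Chat template taken from the tests' prompt_formatter below.
    prompt = f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
    stop_token_ids = None
    return llm, prompt, stop_token_ids
```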
@@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:


 def load_deepseek_vl2(question: str, image_urls: List[str]):
-    model_name = "deepseek-ai/deepseek-vl2-small"
+    model_name = "deepseek-ai/deepseek-vl2-tiny"

     llm = LLM(model=model_name,
               max_model_len=4096,
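As a companion sketch (illustrative, not part of this commit), a multi-image request combines several `<image>` placeholders in the prompt with a list of images in `multi_modal_data`. The prompt wording follows the tests' `multi_image_prompt`; the engine arguments and image paths are assumptions:

```python
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(model="deepseek-ai/deepseek-vl2-tiny",
          max_model_len=4096,
          limit_mm_per_prompt={"image": 2},
          hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})

# Placeholder image files for illustration.
images = [Image.open("image_1.jpg"), Image.open("image_2.jpg")]
prompt = ("<|User|>: image_1:<image>\nimage_2:<image>\n"
          "Which image can we see the car and the tower?\n\n<|Assistant|>:")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": images}},
    SamplingParams(max_tokens=128),
)
print(outputs[0].outputs[0].text)
```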
@@ -9,6 +9,7 @@ from typing import Type

 import pytest
 from transformers import AutoModelForVision2Seq
+from transformers import __version__ as TRANSFORMERS_VERSION
 from transformers.utils import is_flash_attn_2_available

 from vllm.platforms import current_platform
@@ -189,30 +190,27 @@ VLM_TEST_SETTINGS = {
         dtype="bfloat16",
     ),
     "deepseek_vl_v2": VLMTestInfo(
-        models=["deepseek-ai/deepseek-vl2-small"],
+        models=["deepseek-ai/deepseek-vl2-tiny"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        dtype="bfloat16",
         prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
         single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>\nWhat's the color of the stop sign and car?",
-            "cherry_blossom": "<image>\nWhat's the color of the tower?",
+            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
+            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",  # noqa: E501
         }),
-        multi_image_prompt="image_1:<image>\nimage_2:<image>\nDescribe the two images shortly.",  # noqa: E501
+        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",  # noqa: E501
         vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}},  # noqa: E501
-        image_size_factors=[(0.10, 0.15)],
         patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
         postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
         hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
         stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],  # noqa: E501
-        num_logprobs=5,
+        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
         marks=[
             pytest.mark.skipif(
-                not is_flash_attn_2_available(),
-                reason="Model needs flash-attn for numeric convergence.",
-            ),
-            large_gpu_mark(min_gb=48),
+                TRANSFORMERS_VERSION >= "4.48.0",
+                reason="HF model is not compatible with transformers>=4.48.0",
+            )
         ],
     ),
     "fuyu": VLMTestInfo(
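For readability: the `prompt_formatter` above only wraps the raw test prompt in DeepSeek-VL2's chat markers. A quick illustration using the new single-image prompt:

```python
prompt_formatter = lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: "

print(prompt_formatter("<image>\nWhat's the content in the center of the image?"))
# <|User|>: <image>
# What's the content in the center of the image?
#
# <|Assistant|>:
```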
@@ -181,8 +181,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                        trust_remote_code=True),
    "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b",
                                                       is_available_online=False),
-    # TODO(Isotr0py): Use deepseek-vl2-tiny for test after it's supported
-    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-small"),  # noqa: E501
+    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny"),  # noqa: E501
    "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
    "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"),
    "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
@@ -356,13 +356,18 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
             f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
         )

+        if self.text_config.topk_method == "noaux_tc":
+            architectures = ["DeepseekV3ForCausalLM"]
+        elif not self.text_config.use_mla:
+            architectures = ["DeepseekForCausalLM"]
+        else:
+            architectures = ["DeepseekV2ForCausalLM"]
+
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=self.text_config,
             prefix=maybe_prefix(prefix, "language"),
-            architectures=["DeepseekV3ForCausalLM"]
-            if self.text_config.topk_method == "noaux_tc" else
-            ["DeepseekV2ForCausalLM"],
+            architectures=architectures,
         )

         self.make_empty_intermediate_tensors = (
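The net effect of this hunk is that the multimodal wrapper now picks its text backbone per checkpoint instead of assuming only the V2/V3 variants. A standalone sketch of the same dispatch, with purely illustrative config values (the real values come from the checkpoint's HF config):

```python
from types import SimpleNamespace
from typing import List


def select_language_arch(text_config) -> List[str]:
    # Mirrors the branch added above: V3-style checkpoints signal themselves via
    # the "noaux_tc" top-k method, checkpoints without MLA (e.g. the smaller
    # DeepSeek-VL2 language backbones) map to DeepseekForCausalLM, and everything
    # else keeps the DeepSeek-V2 backbone.
    if text_config.topk_method == "noaux_tc":
        return ["DeepseekV3ForCausalLM"]
    if not text_config.use_mla:
        return ["DeepseekForCausalLM"]
    return ["DeepseekV2ForCausalLM"]


# Illustrative values only, not taken from a real config.
tiny_like = SimpleNamespace(topk_method="greedy", use_mla=False)
assert select_language_arch(tiny_like) == ["DeepseekForCausalLM"]
```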