[Model] Add support for deepseek-vl2-tiny model (#12068)
Signed-off-by: Isotr0py <2037008807@qq.com>
parent 5fd24ec02e
commit 62b06ba23d
@@ -618,7 +618,7 @@ See [this page](#generative-models) for more information on how to use generativ
 * - `DeepseekVLV2ForCausalLM`
   - DeepSeek-VL2
   - T + I<sup>+</sup>
-  - `deepseek-ai/deepseek-vl2-tiny`(WIP), `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
+  - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
   -
   - ✅︎
   - ✅︎
@@ -768,9 +768,8 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.

 ````{note}
-The `deepseek-ai/deepseek-vl2-tiny` is not supported yet.

 To use `DeepSeek-VL2` series models, you need to install a fork version `deepseek_vl2` package:

 ```shell
 pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git
 ```
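As a quick follow-up to the install step above, a minimal sanity check (not part of the documented instructions; it only assumes the module is named `deepseek_vl2`, matching the pip command):

```python
# Illustrative check that the forked package installed correctly.
import deepseek_vl2

print(deepseek_vl2.__file__)
```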
@@ -70,7 +70,7 @@ def run_chameleon(question: str, modality: str):
 def run_deepseek_vl2(question: str, modality: str):
     assert modality == "image"

-    model_name = "deepseek-ai/deepseek-vl2-small"
+    model_name = "deepseek-ai/deepseek-vl2-tiny"

     llm = LLM(model=model_name,
               max_model_len=4096,
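The hunk ends mid-call; for context, here is a sketch of how the full helper plausibly reads, following the pattern of the other `run_*` helpers in this example file. The `max_num_seqs`, `hf_overrides`, and returned `stop_token_ids` are assumptions (the `hf_overrides` value mirrors the tests' `vllm_runner_kwargs` further down), not lines from this commit:

```python
from vllm import LLM


def run_deepseek_vl2(question: str, modality: str):
    assert modality == "image"

    model_name = "deepseek-ai/deepseek-vl2-tiny"

    # Engine arguments beyond max_model_len are assumptions for illustration.
    llm = LLM(model=model_name,
              max_model_len=4096,
              max_num_seqs=2,
              hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})

    # Chat template taken from the tests' prompt_formatter below.
    prompt = f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
    stop_token_ids = None
    return llm, prompt, stop_token_ids
```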
@@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:


 def load_deepseek_vl2(question: str, image_urls: List[str]):
-    model_name = "deepseek-ai/deepseek-vl2-small"
+    model_name = "deepseek-ai/deepseek-vl2-tiny"

     llm = LLM(model=model_name,
               max_model_len=4096,
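As a companion sketch (illustrative, not part of this commit), a multi-image request combines several `<image>` placeholders in the prompt with a list of images in `multi_modal_data`. The prompt wording follows the tests' `multi_image_prompt`; the engine arguments and image paths are assumptions:

```python
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(model="deepseek-ai/deepseek-vl2-tiny",
          max_model_len=4096,
          limit_mm_per_prompt={"image": 2},
          hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})

# Placeholder image files for illustration.
images = [Image.open("image_1.jpg"), Image.open("image_2.jpg")]
prompt = ("<|User|>: image_1:<image>\nimage_2:<image>\n"
          "Which image can we see the car and the tower?\n\n<|Assistant|>:")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": images}},
    SamplingParams(max_tokens=128),
)
print(outputs[0].outputs[0].text)
```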
@@ -9,6 +9,7 @@ from typing import Type

 import pytest
 from transformers import AutoModelForVision2Seq
+from transformers import __version__ as TRANSFORMERS_VERSION
 from transformers.utils import is_flash_attn_2_available

 from vllm.platforms import current_platform
@@ -189,30 +190,27 @@ VLM_TEST_SETTINGS = {
         dtype="bfloat16",
     ),
     "deepseek_vl_v2": VLMTestInfo(
-        models=["deepseek-ai/deepseek-vl2-small"],
+        models=["deepseek-ai/deepseek-vl2-tiny"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        dtype="bfloat16",
         prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
         single_image_prompts=IMAGE_ASSETS.prompts({
-            "stop_sign": "<image>\nWhat's the color of the stop sign and car?",
-            "cherry_blossom": "<image>\nWhat's the color of the tower?",
+            "stop_sign": "<image>\nWhat's the content in the center of the image?",  # noqa: E501
+            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",  # noqa: E501
         }),
-        multi_image_prompt="image_1:<image>\nimage_2:<image>\nDescribe the two images shortly.",  # noqa: E501
+        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",  # noqa: E501
         vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}},  # noqa: E501
-        image_size_factors=[(0.10, 0.15)],
         patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
         postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
         hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
         stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],  # noqa: E501
-        num_logprobs=5,
+        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
         marks=[
             pytest.mark.skipif(
-                not is_flash_attn_2_available(),
-                reason="Model needs flash-attn for numeric convergence.",
-            ),
-            large_gpu_mark(min_gb=48),
+                TRANSFORMERS_VERSION >= "4.48.0",
+                reason="HF model is not compatible with transformers>=4.48.0",
+            )
         ],
     ),
     "fuyu": VLMTestInfo(
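For readability: the `prompt_formatter` above only wraps the raw test prompt in DeepSeek-VL2's chat markers. A quick illustration using the new single-image prompt:

```python
prompt_formatter = lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: "

print(prompt_formatter("<image>\nWhat's the content in the center of the image?"))
# <|User|>: <image>
# What's the content in the center of the image?
#
# <|Assistant|>:
```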
@@ -181,8 +181,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                        trust_remote_code=True),
    "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b",
                                                       is_available_online=False),
-    # TODO(Isotr0py): Use deepseek-vl2-tiny for test after it's supported
-    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-small"),  # noqa: E501
+    "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny"),  # noqa: E501
    "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
    "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"),
    "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
@@ -356,13 +356,18 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
             f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
         )

+        if self.text_config.topk_method == "noaux_tc":
+            architectures = ["DeepseekV3ForCausalLM"]
+        elif not self.text_config.use_mla:
+            architectures = ["DeepseekForCausalLM"]
+        else:
+            architectures = ["DeepseekV2ForCausalLM"]
+
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             hf_config=self.text_config,
             prefix=maybe_prefix(prefix, "language"),
-            architectures=["DeepseekV3ForCausalLM"]
-            if self.text_config.topk_method == "noaux_tc" else
-            ["DeepseekV2ForCausalLM"],
+            architectures=architectures,
         )

         self.make_empty_intermediate_tensors = (
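The net effect of this hunk is that the multimodal wrapper now picks its text backbone per checkpoint instead of assuming only the V2/V3 variants. A standalone sketch of the same dispatch, with purely illustrative config values (the real values come from the checkpoint's HF config):

```python
from types import SimpleNamespace
from typing import List


def select_language_arch(text_config) -> List[str]:
    # Mirrors the branch added above: V3-style checkpoints signal themselves via
    # the "noaux_tc" top-k method, checkpoints without MLA (e.g. the smaller
    # DeepSeek-VL2 language backbones) map to DeepseekForCausalLM, and everything
    # else keeps the DeepSeek-V2 backbone.
    if text_config.topk_method == "noaux_tc":
        return ["DeepseekV3ForCausalLM"]
    if not text_config.use_mla:
        return ["DeepseekForCausalLM"]
    return ["DeepseekV2ForCausalLM"]


# Illustrative values only, not taken from a real config.
tiny_like = SimpleNamespace(topk_method="greedy", use_mla=False)
assert select_language_arch(tiny_like) == ["DeepseekForCausalLM"]
```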